diff --git a/utils/score-pdb-structures-plugin/.bumpversion.cfg b/utils/score-pdb-structures-plugin/.bumpversion.cfg new file mode 100644 index 00000000..90027fb0 --- /dev/null +++ b/utils/score-pdb-structures-plugin/.bumpversion.cfg @@ -0,0 +1,29 @@ +[bumpversion] +current_version = 0.1.0 +commit = False +tag = False +parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:plugin.json] + +[bumpversion:file:src/polus/mm/utils/score_pdb_structures/__init__.py] diff --git a/utils/score-pdb-structures-plugin/.dockerignore b/utils/score-pdb-structures-plugin/.dockerignore new file mode 100644 index 00000000..7c603f81 --- /dev/null +++ b/utils/score-pdb-structures-plugin/.dockerignore @@ -0,0 +1,4 @@ +.venv +out +tests +__pycache__ diff --git a/utils/score-pdb-structures-plugin/.gitignore b/utils/score-pdb-structures-plugin/.gitignore new file mode 100644 index 00000000..c04bc49f --- /dev/null +++ b/utils/score-pdb-structures-plugin/.gitignore @@ -0,0 +1 @@ +poetry.lock diff --git a/utils/score-pdb-structures-plugin/CHANGELOG.md b/utils/score-pdb-structures-plugin/CHANGELOG.md new file mode 100644 index 00000000..b67793f7 --- /dev/null +++ b/utils/score-pdb-structures-plugin/CHANGELOG.md @@ -0,0 +1,5 @@ +# CHANGELOG + +## 0.1.0 + +Initial release. 
diff --git a/utils/score-pdb-structures-plugin/Dockerfile b/utils/score-pdb-structures-plugin/Dockerfile new file mode 100644 index 00000000..e8ac9249 --- /dev/null +++ b/utils/score-pdb-structures-plugin/Dockerfile @@ -0,0 +1,22 @@ +FROM condaforge/mambaforge + +ENV EXEC_DIR="/opt/executables" +ENV POLUS_LOG="INFO" +RUN mkdir -p ${EXEC_DIR} + + +# Work directory defined in the base container +# WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY CHANGELOG.md ${EXEC_DIR} + +# Install needed packages here + +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +CMD ["--help"] diff --git a/utils/score-pdb-structures-plugin/README.md b/utils/score-pdb-structures-plugin/README.md new file mode 100644 index 00000000..ff982188 --- /dev/null +++ b/utils/score-pdb-structures-plugin/README.md @@ -0,0 +1,18 @@ +# score_pdb_structures (0.1.0) + +Fetches the PDB information from RCSB and scores PDB structures. + +## Options + +This plugin takes 6 input arguments and 2 output arguments: + +| Name | Description | I/O | Type | Default | +|---------------|-------------------------|--------|--------|---------| +| input_pdbids | List of input PDBIDs to score, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | ['null', {'type': 'array', 'items': 'string'}] | +| min_row | The row min index, Type: int | Input | int | int | +| max_row | The row max index, Type: int | Input | int | int | +| output_txt_path | Path to the text dataset file, Type: string, File type: output, Accepted formats: txt | Input | string | string | +| timeout_duration | The maximum time in seconds to wait for a response from the API before timing out, Type: int | Input | int | int | +| max_retries | The maximum number of times to retry the request in case of failure, Type: int | Input | int | int | +| output_txt_path | Path to the txt file | Output | File | File | 
+| output_pdbids | The selected PDB IDs | Output | {'type': 'array', 'items': 'string'} | {'type': 'array', 'items': 'string'} | diff --git a/utils/score-pdb-structures-plugin/VERSION b/utils/score-pdb-structures-plugin/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/utils/score-pdb-structures-plugin/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/utils/score-pdb-structures-plugin/build-docker.sh b/utils/score-pdb-structures-plugin/build-docker.sh new file mode 100755 index 00000000..fa9b1bdc --- /dev/null +++ b/utils/score-pdb-structures-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", "Brandon Walker "] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.13" +typer = "^0.7.0" +sophios = "0.1.4" +pandas = "2.2.2" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pytest = "^7.4" +pytest-sugar = "^0.9.6" +pre-commit = "^3.2.1" +black = "^23.3.0" +mypy = "^1.1.1" +ruff = "^0.0.270" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +pythonpath = [ + "." +] diff --git a/utils/score-pdb-structures-plugin/score_pdb_structures_0@1@0.cwl b/utils/score-pdb-structures-plugin/score_pdb_structures_0@1@0.cwl new file mode 100644 index 00000000..8671e03f --- /dev/null +++ b/utils/score-pdb-structures-plugin/score_pdb_structures_0@1@0.cwl @@ -0,0 +1,135 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 + +class: CommandLineTool + +label: Fetches the PDB information from RCSB and scores PDB structures. + +doc: |- + Fetches the PDB information from RCSB and scores PDB structures. 
+ +baseCommand: ["python", "-m", "polus.mm.utils.score_pdb_structures"] + +hints: + DockerRequirement: + dockerPull: ndonyapour/score_pdb_structures_tool + +requirements: + InlineJavascriptRequirement: {} + + +inputs: + input_pdbids: + label: List of input PDBIDs to score + doc: |- + List of input PDBIDs to score + Type: string[] + File type: input + Accepted formats: list[string] + type: ["null", {"type": "array", "items": "string"}] + format: edam:format_2330 + inputBinding: + prefix: --input_pdbids + default: [] + + min_row: + label: The row min index + doc: |- + The row min index + Type: int + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --min_row + default: 1 + + max_row: + label: The row max index + doc: |- + The row max index + Type: int + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --max_row + default: -1 + + output_txt_path: + label: Path to the text dataset file + doc: |- + Path to the text dataset file + Type: string + File type: output + Accepted formats: txt + type: string + format: + - edam:format_2330 + inputBinding: + prefix: --output_txt_path + default: system.log + + timeout_duration: + label: The maximum time in seconds to wait for a response from the API before timing out + doc: |- + The maximum time in seconds to wait for a response from the API before timing out + Type: int + type: int? + format: + - edam:format_2330 + inputBinding: + prefix: --timeout_duration + default: 10 + + max_retries: + label: The maximum number of times to retry the request in case of failure + doc: |- + The maximum number of times to retry the request in case of failure + Type: int + type: int? 
+ format: + - edam:format_2330 + inputBinding: + prefix: --max_retries + default: 5 + +outputs: + output_txt_path: + label: Path to the txt file + doc: |- + Path to the txt file + type: File + outputBinding: + glob: $(inputs.output_txt_path) + format: edam:format_2330 + + output_pdbids: + label: The selected PDB IDs + doc: |- + The selected PDB IDs + type: + type: array + items: string + outputBinding: + glob: $(inputs.output_txt_path) + loadContents: true + outputEval: | + ${ + // check if self[0] exists + if (!self[0]) { + return null; + } + var lines = self[0].contents.split("\n"); + // remove blank lines + lines = lines.filter(function(line) {return line.trim() !== '';}); + // The output file has one line + // The format of the line is as follows: 6x7z,4yo7,5fyr,6ktl,4rxm,4irx,3v16,6b5z,4mio,7d5n + return lines[0].split(",").map(function(item) {return item.trim();}); + } + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl diff --git a/utils/score-pdb-structures-plugin/src/polus/mm/utils/score_pdb_structures/__init__.py b/utils/score-pdb-structures-plugin/src/polus/mm/utils/score_pdb_structures/__init__.py new file mode 100644 index 00000000..7411db7a --- /dev/null +++ b/utils/score-pdb-structures-plugin/src/polus/mm/utils/score_pdb_structures/__init__.py @@ -0,0 +1,7 @@ +"""score_pdb_structures.""" + +__version__ = "0.1.0" + +from polus.mm.utils.score_pdb_structures.score_pdb_structures import ( # noqa # pylint: disable=unused-import + score_pdb_structures, +) diff --git a/utils/score-pdb-structures-plugin/src/polus/mm/utils/score_pdb_structures/__main__.py b/utils/score-pdb-structures-plugin/src/polus/mm/utils/score_pdb_structures/__main__.py new file mode 100644 index 00000000..38505634 --- /dev/null +++ b/utils/score-pdb-structures-plugin/src/polus/mm/utils/score_pdb_structures/__main__.py @@ -0,0 +1,81 @@ +"""Package entrypoint for the score_pdb_structures 
package.""" + +# Base packages +import argparse +import logging +from os import environ + +from polus.mm.utils.score_pdb_structures.score_pdb_structures import ( + score_pdb_structures, +) + +logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "INFO")) +logger = logging.getLogger("polus.mm.utils.score_pdb_structures.") +logger.setLevel(POLUS_LOG) + + +def main(args: argparse.Namespace) -> None: + """score_pdb_structures.""" + logger.info(f"input_pdbids: {args.input_pdbids}") + logger.info(f"output_txt_path: {args.output_txt_path}") + logger.info(f"min_row: {args.min_row}") + logger.info(f"max_row: {args.max_row}") + logger.info(f"timeout_duration: {args.timeout_duration}") + logger.info(f"max_retries: {args.max_retries}") + + score_pdb_structures( + input_pdbids=args.input_pdbids, + output_txt_path=args.output_txt_path, + min_row=args.min_row, + max_row=args.max_row, + timeout_duration=args.timeout_duration, + max_retries=args.max_retries, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="score_pdb_structures.") + parser.add_argument( + "--input_pdbids", + type=str, + nargs="+", + required=True, + help="List of input PDBIDs to score, Type string[]", + ) + parser.add_argument( + "--output_txt_path", + type=str, + required=True, + help="Path to the text dataset file, Type string", + ) + parser.add_argument( + "--min_row", + type=int, + required=True, + help="The row min index, Type int", + ) + parser.add_argument( + "--max_row", + type=int, + required=True, + help="The row max index, Type int", + ) + parser.add_argument( + "--timeout_duration", + type=int, + required=True, + help="The maximum time to wait for a response from the API before timing out", + ) + parser.add_argument( + "--max_retries", + type=int, + required=True, + help="The maximum number of times to retry the request in case of failure", + ) + 
+ args = parser.parse_args() + main(args) diff --git a/utils/score-pdb-structures-plugin/src/polus/mm/utils/score_pdb_structures/score_pdb_structures.py b/utils/score-pdb-structures-plugin/src/polus/mm/utils/score_pdb_structures/score_pdb_structures.py new file mode 100644 index 00000000..e12a977d --- /dev/null +++ b/utils/score-pdb-structures-plugin/src/polus/mm/utils/score_pdb_structures/score_pdb_structures.py @@ -0,0 +1,212 @@ +"""Fetches the PDB information from RCSB and scores PDB structures.""" +import time +from pathlib import Path +from typing import Any +from typing import Optional + +import pandas as pd +import requests + +SUCCESS = 200 +TOO_MANY_REQUESTS = 429 + + +def fetch_pdb_data( + pdb_id: str, + timeout_duration: int = 10, + max_retries: int = 5, +) -> Optional[dict[str, Any]]: + """Retrieve information for a given PDB ID from the RCSB PDB API. + + Args: + pdb_id (str): The PDB ID of the protein structure. + timeout_duration (int, optional): The maximum time in seconds to wait for + a response from the API before timing out. Defaults to 10. + max_retries (int, optional): The maximum number of times to retry the request + in case of failures. Defaults to 5. + + Returns: + Optional[Dict[str, Any]]: A dictionary containing the relevant data of the + PDB entry, or None if the request fails. 
+ """ + url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}" + retries = 0 + + while retries < max_retries: + try: + response = requests.get(url, timeout=timeout_duration) + + if response.status_code == SUCCESS: + data = response.json() + + # Extract relevant data from the JSON response + resolution = data["rcsb_entry_info"].get("resolution_combined", [None])[ + 0 + ] + experimental_method = data["exptl"][0].get("method", "N/A") + citations = len( + data["rcsb_entry_container_identifiers"].get("related_pubmeds", []), + ) + + # Extract r_free and r_work from the refine section + r_free = "N/A" + r_work = "N/A" + if "refine" in data: + refine_data = data["refine"][ + 0 + ] # Assuming we're interested in the first refine entry + r_free = refine_data.get("ls_rfactor_rfree", "N/A") + r_work = refine_data.get("ls_rfactor_rwork", "N/A") + + return { + "PDB ID": pdb_id, + "Resolution": resolution, + "R-Free": r_free, + "R-Work": r_work, + "Experimental Method": experimental_method, + "Citations": citations, + } + elif response.status_code == TOO_MANY_REQUESTS: # noqa: RET505 + # Exponential backoff + wait_time = 2**retries + print( # noqa: T201 + f"Received status code 429. Retrying in {wait_time} seconds...", + ) + time.sleep(wait_time) + retries += 1 + else: + return None + except requests.exceptions.Timeout: + print( # noqa: T201 + f"Request to {url} timed out after {timeout_duration} seconds.", + ) + return None + except requests.exceptions.RequestException as e: + print(f"An error occurred: {e}") # noqa: T201 + return None + return None + + +def score_pdb_entry(entry: dict[str, Any]) -> float: + """Calculate a score for a PDB entry based on multiple criteria. + + Args: + entry (Dict[str, Any]): A dictionary containing the relevant + data of the PDB entry + + Returns: + float: The total score for the PDB entry. 
+ """ + # Define weights for each criterion + weights = { + "Resolution": 0.35, + "R-Free": 0.25, + "R-Work": 0.20, + "Citations": 0.10, + "Experimental Method": 0.10, + } + + # Normalize and compute the score for each criterion + resolution_score = 1 / entry["Resolution"] if entry["Resolution"] else 0 + r_free_score = 1 / entry["R-Free"] if entry["R-Free"] != "N/A" else 0 + r_work_score = 1 / entry["R-Work"] if entry["R-Work"] != "N/A" else 0 + citations_score = entry["Citations"] + + # Score the experimental method, giving preference to X-ray diffraction + experimental_method_score = ( + 1 if entry["Experimental Method"].lower() == "x-ray diffraction" else 0 + ) + + # Combine scores with weights + return ( + weights["Resolution"] * resolution_score + + weights["R-Free"] * r_free_score + + weights["R-Work"] * r_work_score + + weights["Citations"] * citations_score + + weights["Experimental Method"] * experimental_method_score + ) + + +def filter_pdbs( # noqa: PLR0913 + pdb_ids: list[str], + output_txt_path: str, + min_row: int = 1, + max_row: int = -1, + timeout_duration: int = 10, + max_retries: int = 5, +) -> None: + """Filter, score, and sort PDB entries, then save the results to a txt file. + + Args: + pdb_ids (List[str]): A list of PDB IDs to process. + output_txt_path (str): The path to the output file to save the results. + min_row (int, optional): min index of rows. Defaults to 1. + max_row (int, optional): max index of rows. Defaults to -1. + timeout_duration (int, optional): The maximum time in seconds to wait for + a response from the API before timing out. Defaults to 10. + max_retries (int, optional): The maximum number of times to retry the request + in case of failures. Defaults to 5. 
+ + Returns: + None + """ + pdb_info_list: list[dict[str, Any]] = [] + + for pdb_id in pdb_ids: + pdb_info = fetch_pdb_data(pdb_id, timeout_duration, max_retries) + if pdb_info is not None: + pdb_info_list.append(pdb_info) + + # Score each PDB entry + for entry in pdb_info_list: + entry["Score"] = score_pdb_entry(entry) + + # Convert to DataFrame for easy viewing and sorting + df = pd.DataFrame(pdb_info_list) + df = df.sort_values(by="Score", ascending=False) + print(df.shape) # noqa: T201 + + if int(min_row) != 1 or int(max_row) != -1: + # We want to convert to zero-based indices and we also want + # the upper index to be inclusive (i.e. <=) so -1 lower index. + df = df[(int(min_row) - 1) : int(max_row)] + print(df) # noqa: T201 + + # Now restrict to the column we want + with Path.open(Path(output_txt_path), mode="w", encoding="utf-8") as f: + f.write(",".join(df["PDB ID"].dropna().to_list()) + "\n") + + +def score_pdb_structures( # noqa: PLR0913 + input_pdbids: list[str], + output_txt_path: str, + min_row: int = 1, + max_row: int = -1, + timeout_duration: int = 10, + max_retries: int = 5, +) -> None: + """score_pdb_structures. + + Args: + input_pdbids: List of input PDBIDs to score, Type string[], File type input, + Accepted formats list[string] + min_row: The row min index, Type int + max_row: The row max index, Type int + output_txt_path: Path to the text dataset file, Type string, File type output, + Accepted formats txt + timeout_duration (int, optional): The maximum time in seconds to wait for + a response from the API before timing out. Defaults to 10. + max_retries (int, optional): The maximum number of times to retry the request + in case of failures. Defaults to 5. 
+ + Returns: + None + """ + filter_pdbs( + input_pdbids, + output_txt_path, + min_row=min_row, + max_row=max_row, + timeout_duration=timeout_duration, + max_retries=max_retries, + ) diff --git a/utils/score-pdb-structures-plugin/tests/__init__.py b/utils/score-pdb-structures-plugin/tests/__init__.py new file mode 100644 index 00000000..fd4b58bb --- /dev/null +++ b/utils/score-pdb-structures-plugin/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for score_pdb_structures.""" diff --git a/utils/score-pdb-structures-plugin/tests/test_score_pdb_structures.py b/utils/score-pdb-structures-plugin/tests/test_score_pdb_structures.py new file mode 100644 index 00000000..9a571f02 --- /dev/null +++ b/utils/score-pdb-structures-plugin/tests/test_score_pdb_structures.py @@ -0,0 +1,65 @@ +"""Tests for score_pdb_structures.""" +from pathlib import Path + +from polus.mm.utils.score_pdb_structures.score_pdb_structures import ( + score_pdb_structures, +) +from sophios.api.pythonapi import Step +from sophios.api.pythonapi import Workflow + + +def test_score_pdb_structures() -> None: + """Test score_pdb_structures.""" + input_pdbids = [ + "1g0i", + "1iev", + "1ptg", + "1y7v", + "2huo", + "2os9", + "2r71", + "2x1i", + "3bxd", + "1aod", + ] + score_pdb_structures(input_pdbids, "output.txt", max_row=1) + + assert Path("output.txt").exists() + + +def test_score_pdb_structures_cwl() -> None: + """Test score_pdb_structures CWL.""" + cwl_file = Path("score_pdb_structures_0@1@0.cwl") + + # Create the step for the CWL file + score_pdb_structures_step = Step(clt_path=cwl_file) + input_pdbids = [ + "1g0i", + "1iev", + "1ptg", + "1y7v", + "2huo", + "2os9", + "2r71", + "2x1i", + "3bxd", + "1aod", + ] + + score_pdb_structures_step.input_pdbids = input_pdbids + score_pdb_structures_step.output_txt_path = "output_scored.txt" + score_pdb_structures_step.max_row = 1 + + # Define the workflow with the step + steps = [score_pdb_structures_step] + filename = "score_pdb_structures" + workflow = Workflow(steps, 
filename) + + # Run the workflow + workflow.run() + + # Check for the existence of the output file + outdir = Path("outdir") + assert any( + file.name == "output_scored.txt" for file in outdir.rglob("*") + ), "The file output_scored.txt was not found."