extract_ligand_protein plugin

PolusAI · May 28, 2024 · 07e3016 · 07e3016
1 parent 5965ec4
commit 07e3016
Show file tree

Hide file tree

Showing 15 changed files with 370 additions and 0 deletions.
diff --git a/utils/extract-ligand-protein-plugin/.bumpversion.cfg b/utils/extract-ligand-protein-plugin/.bumpversion.cfg
@@ -0,0 +1,29 @@
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize = 
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values = 
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:src/polus/mm/utils/extract_ligand_protein/__init__.py]
diff --git a/utils/extract-ligand-protein-plugin/.dockerignore b/utils/extract-ligand-protein-plugin/.dockerignore
@@ -0,0 +1,4 @@
+.venv
+out
+tests
+__pycache__
diff --git a/utils/extract-ligand-protein-plugin/.gitignore b/utils/extract-ligand-protein-plugin/.gitignore
@@ -0,0 +1 @@
+poetry.lock
diff --git a/utils/extract-ligand-protein-plugin/CHANGELOG.md b/utils/extract-ligand-protein-plugin/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## 0.1.0
+
+Initial release.
diff --git a/utils/extract-ligand-protein-plugin/Dockerfile b/utils/extract-ligand-protein-plugin/Dockerfile
@@ -0,0 +1,27 @@
+FROM condaforge/mambaforge
+
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+RUN mkdir -p ${EXEC_DIR}
+
+
+# Work directory defined in the base container
+# WORKDIR ${EXEC_DIR}
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY CHANGELOG.md ${EXEC_DIR}
+
+# Install needed packages here
+
+RUN pip install filepattern
+RUN conda config --add channels conda-forge
+RUN conda install mdanalysis
+
+
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+
+CMD ["--help"]
diff --git a/utils/extract-ligand-protein-plugin/README.md b/utils/extract-ligand-protein-plugin/README.md
@@ -0,0 +1,12 @@
+# extract_ligand_protein (0.1.0)
+
+A tool that employs MDAnalysis to extract ligands and protein from a PDB file.
+
+## Options
+
+This plugin takes 3 input arguments and 0 output arguments:
+
+| Name          | Description             | I/O    | Type   | Default |
+|---------------|-------------------------|--------|--------|---------|
+| input_pdb_path | Input pdb file path, Type: string, File type: input, Accepted formats: pdb, Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/data/utils/cat_protein.pdb | Input | File | File |
+| output_pdb_path | Output pdb file path, Type: string, File type: output, Accepted formats: pdb, Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb | Input | string | string |
+| output_pdb_ligand_path | Output pdb ligand file path, Type: string, File type: output, Accepted formats: sdf | Input | string | string |
diff --git a/utils/extract-ligand-protein-plugin/VERSION b/utils/extract-ligand-protein-plugin/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/utils/extract-ligand-protein-plugin/build-docker.sh b/utils/extract-ligand-protein-plugin/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+version=$(<VERSION)
+docker build . -t polusai/extract-ligand-protein-plugin:${version}
diff --git a/utils/extract-ligand-protein-plugin/ict.yml b/utils/extract-ligand-protein-plugin/ict.yml
@@ -0,0 +1,48 @@
+specVersion: "0.1.0"
+name: extract_ligand_protein
+version: 0.1.0
+container: extract-ligand-protein-plugin
+entrypoint:
+title: extract_ligand_protein
+description: A tool that employs MDAnalysis to extract ligands and protein from a PDB file
+author: Data Scientist
+contact: [email protected]
+repository:
+documentation:
+citation:
+
+inputs:
+  - name: input_pdb_path
+    required: true
+    description: Input pdb file path, Type string, File type input, Accepted formats pdb, Example file https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/data/utils/cat_protein.pdb
+    type: File
+    format:
+      uri: edam:format_1476
+  - name: output_pdb_path
+    required: true
+    description: Output pdb file path, Type string, File type output, Accepted formats pdb, Example file https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb
+    type: string
+    defaultValue: system.pdb
+    format:
+      uri: edam:format_1476
+  - name: output_pdb_ligand_path
+    required: true
+    description: Output pdb ligand file path, Type string, File type output, Accepted formats sdf
+    type: string
+    defaultValue: ligand_system.pdb
+    format:
+      uri: edam:format_1476
+outputs:
+ui:
+  - key: inputs.input_pdb_path
+    title: "input_pdb_path: "
+    description: "Input pdb file path, Type: string, File type: input, Accepted formats: pdb, Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/data/utils/cat_protein.pdb"
+    type: File
+  - key: inputs.output_pdb_path
+    title: "output_pdb_path: "
+    description: "Output pdb file path, Type: string, File type: output, Accepted formats: pdb, Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb"
+    type: string
+  - key: inputs.output_pdb_ligand_path
+    title: "output_pdb_ligand_path: "
+    description: "Output pdb ligand file path, Type: string, File type: output, Accepted formats: sdf"
+    type: string
diff --git a/utils/extract-ligand-protein-plugin/pyproject.toml b/utils/extract-ligand-protein-plugin/pyproject.toml
@@ -0,0 +1,32 @@
+[tool.poetry]
+name = "polus-mm-utils-extract-ligand-protein"
+version = "0.1.0"
+description = "An awesome function."
+authors = ["Data Scientist <[email protected]>"]
+readme = "README.md"
+packages = [{include = "polus", from = "src"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.12"
+typer = "^0.7.0"
+mdanalysis = "2.7.0"
+cwl-utils = "0.33"
+cwltool = "3.1.20240404144621"
+
+[tool.poetry.group.dev.dependencies]
+bump2version = "^1.0.1"
+pytest = "^7.4"
+pytest-sugar = "^0.9.6"
+pre-commit = "^3.2.1"
+black = "^23.3.0"
+mypy = "^1.1.1"
+ruff = "^0.0.270"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+pythonpath = [
+  "."
+]
diff --git a/utils/extract-ligand-protein-plugin/src/polus/mm/utils/extract_ligand_protein/__init__.py b/utils/extract-ligand-protein-plugin/src/polus/mm/utils/extract_ligand_protein/__init__.py
@@ -0,0 +1,7 @@
+"""extract_ligand_protein."""
+
+__version__ = "0.1.0"
+
+from src.polus.mm.utils.extract_ligand_protein import (  # noqa # pylint: disable=unused-import
+    extract_ligand_protein,
+)
diff --git a/utils/extract-ligand-protein-plugin/src/polus/mm/utils/extract_ligand_protein/__main__.py b/utils/extract-ligand-protein-plugin/src/polus/mm/utils/extract_ligand_protein/__main__.py
@@ -0,0 +1,55 @@
+"""Package entrypoint for the extract_ligand_protein package."""
+
+# Base packages
+import logging
+from os import environ
+from pathlib import Path
+
+import typer
+from polus.mm.utils.extract_ligand_protein.extract_ligand_protein import (
+    extract_all_ligand_protein,  # noqa: F401
+    extract_single_ligand_protein,
+)
+
+# Configure the format and timestamp style used for log records.
+logging.basicConfig(
+    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
+    datefmt="%d-%b-%y %H:%M:%S",
+)
+# The log level name is read from the POLUS_LOG environment variable (default
+# "INFO") and resolved to the attribute of that name on the logging module.
+POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "INFO"))
+# NOTE(review): the logger name ends with a trailing "." - looks like a typo,
+# confirm before changing since it alters the name shown in log output.
+logger = logging.getLogger("polus.mm.utils.extract_ligand_protein.")
+logger.setLevel(POLUS_LOG)
+
+# Typer application that exposes the command-line interface of this package.
+app = typer.Typer(help="extract_ligand_protein.")
+
+
+@app.command()
+def main(
+    input_pdb_path: Path = typer.Option(
+        ...,
+        "--input_pdb_path",
+        help="Input pdb file path, Type: string",
+    ),
+    output_pdb_path: str = typer.Option(
+        ...,
+        "--output_pdb_path",
+        help="Output pdb file path, Type: string",
+    ),
+    output_pdb_ligand_path: str = typer.Option(
+        ...,
+        "--output_pdb_ligand_path",
+        help="Output pdb ligand file path, Type: string",
+    ),
+) -> None:
+    """extract_ligand_protein."""
+    logger.info(f"input_pdb_path: {input_pdb_path}")
+    logger.info(f"output_pdb_path: {output_pdb_path}")
+    logger.info(f"output_pdb_ligand_path: {output_pdb_ligand_path}")
+
+    extract_all_ligand_protein(
+        input_pdb_path=input_pdb_path,
+        output_pdb_path=output_pdb_path,
+        output_pdb_ligand_path=output_pdb_ligand_path,
+    )
+
+
+if __name__ == "__main__":
+    app()
diff --git a/...ligand-protein-plugin/src/polus/mm/utils/extract_ligand_protein/extract_ligand_protein.py b/...ligand-protein-plugin/src/polus/mm/utils/extract_ligand_protein/extract_ligand_protein.py
@@ -0,0 +1,115 @@
+"""Extract ligand and protein from the PDB file."""
+
+from pathlib import Path
+
+import MDAnalysis
+
+
+def extract_single_ligand_protein(  # noqa: PLR0912
+    input_pdb_path: Path,
+    output_pdb_path: Path,
+    output_pdb_ligand_path: Path,
+) -> None:
+    """Extract ligand & protein from the PDB file.
+
+    Args:
+        input_pdb_path (Path): The path to the input pdb file
+        output_pdb_path (Path): The path to the output pdb file
+        output_pdb_ligand_path (Path): The path to the output pdb ligand file
+    """
+    # Load the PDB file
+    u = MDAnalysis.Universe(input_pdb_path)
+
+    # Get unique residue names
+    protein_atoms = u.select_atoms("protein")  # use simple atom selection when possible
+
+    # Create a new Universe with only protein atoms
+    protein_u = MDAnalysis.Universe.empty(
+        n_atoms=protein_atoms.n_atoms,
+        trajectory=True,
+    )  # needed for coordinates
+    protein_u.atoms = protein_atoms
+
+    # duplicate the universe object
+    dup_u = MDAnalysis.Universe(input_pdb_path)
+
+    # now do the same for the ligand, not protein and not water or salts
+    ligand_atoms = u.select_atoms("not protein")
+
+    try:
+        # guess the bonds, since input PDB may not have bonds
+        dup_u.atoms.guess_bonds()
+    except ValueError:
+        # ValueError: vdw radii for types: AS.
+        # These can be defined manually using the keyword 'vdwradii'
+        print("Error: Could not guess bonds. Check the input PDB file.")  # noqa: T201
+
+    has_bonds = False
+    try:
+        len(dup_u.atoms.bonds)
+        has_bonds = True
+    except MDAnalysis.exceptions.NoDataError:
+        print("No bonds found in the PDB file.")  # noqa: T201
+
+    # Identify water molecules based on the
+    # connectivity pattern (Oxygen bonded to two Hydrogens)
+    if has_bonds:
+        water_indices = set()
+        for atom in dup_u.atoms:  # dont use selection resname == 'HOH',
+            # pdb file may have different water residue names
+            h_bonds = 2
+            if (
+                atom.name == "O" and len(atom.bonds) == h_bonds
+            ):  # if hydrogens are added
+                bonded_atoms_names = {a.name for a in atom.bonded_atoms}
+                if bonded_atoms_names == {"H"}:  # Check if both bonds are Hydrogens
+                    water_indices.add(atom.index)
+                    water_indices.update([a.index for a in atom.bonded_atoms])
+
+        # now want to remove all salts, waters without H
+        non_bonded = set()
+        for atom in dup_u.atoms:
+            if len(atom.bonds) == 0:
+                non_bonded.add(atom.index)
+
+        # Remove water by excluding the water indices
+        if len(water_indices) > 0:
+            water_indices_string = " ".join([str(i) for i in water_indices])
+            ligand_atoms = ligand_atoms.select_atoms(
+                f"not index {water_indices_string}",
+            )
+
+        # Remove non bonded atoms
+        if len(non_bonded) > 0:
+            non_bonded_string = " ".join([str(i) for i in non_bonded])
+            ligand_atoms = ligand_atoms.select_atoms(f"not index {non_bonded_string}")
+
+    ligand_u = MDAnalysis.Universe.empty(
+        n_atoms=ligand_atoms.n_atoms,
+        trajectory=True,
+    )  # needed for coordinates
+    ligand_u.atoms = ligand_atoms
+
+    protein_u.atoms.write(str(output_pdb_path))
+    if len(ligand_u.atoms) > 0:  # will crash if no ligand atoms
+        ligand_u.atoms.write(str(output_pdb_ligand_path))
+
+
+def extract_all_ligand_protein(input_pdb_path: list[Path], outdir: Path) -> None:
+    """extract_ligand_protein.
+
+    Args:
+        input_pdb_path: Input pdb file path
+        outdir: Output collection.
+
+    Returns:
+        None
+    """
+    for pdb in input_pdb_path:
+        output_pdb_path = outdir / f"{pdb.stem}_protein.pdb"
+        output_pdb_ligand_path = outdir / f"{pdb.stem}_ligand.pdb"
+        extract_single_ligand_protein(
+            pdb,
+            output_pdb_path,
+            output_pdb_ligand_path,
+        )
diff --git a/utils/extract-ligand-protein-plugin/tests/__init__.py b/utils/extract-ligand-protein-plugin/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for extract_ligand_protein."""
diff --git a/utils/extract-ligand-protein-plugin/tests/test_extract_ligand_protein.py b/utils/extract-ligand-protein-plugin/tests/test_extract_ligand_protein.py
@@ -0,0 +1,29 @@
+"""Tests for extract_ligand_protein."""
+from pathlib import Path
+
+from polus.mm.utils.extract_ligand_protein.extract_ligand_protein import (
+    extract_single_ligand_protein,
+)
+
+
+def test_extract_single_ligand_protein() -> None:
+    """Test extract_single_ligand_protein."""
+    # Get the parent directory of the current file
+    test_dir = Path(__file__).resolve().parent
+
+    # Use glob to find all PDB files in the same directory
+    pdb_files = test_dir.glob("*.pdb")
+
+    # Iterate through each PDB file and test extract_single_ligand_protein
+    for pdb_file in pdb_files:
+        pdb = Path(pdb_file)
+        output_pdb_path = Path(f"{pdb.stem}_protein.pdb")
+        output_pdb_ligand_path = Path(f"{pdb.stem}_ligand.pdb")
+        extract_single_ligand_protein(pdb, output_pdb_path, output_pdb_ligand_path)
+        with output_pdb_path.open() as f:
+            for line in f:
+                assert not line.startswith("HETATM")
+
+        with output_pdb_ligand_path.open() as f:
+            for line in f:
+                assert not line.startswith("ATOM")