Skip to content

Commit

Permalink
extract_ligand_protein plugin
Browse files Browse the repository at this point in the history
  • Loading branch information
Brandon Duane Walker authored and Brandon Duane Walker committed May 28, 2024
1 parent 5965ec4 commit 07e3016
Show file tree
Hide file tree
Showing 15 changed files with 370 additions and 0 deletions.
29 changes: 29 additions & 0 deletions utils/extract-ligand-protein-plugin/.bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[bumpversion]
current_version = 0.1.0
commit = False
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
serialize =
{major}.{minor}.{patch}-{release}{dev}
{major}.{minor}.{patch}

[bumpversion:part:release]
optional_value = _
first_value = dev
values =
dev
_

[bumpversion:part:dev]

[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"

[bumpversion:file:VERSION]

[bumpversion:file:README.md]

[bumpversion:file:plugin.json]

[bumpversion:file:src/polus/mm/utils/extract_ligand_protein/__init__.py]
4 changes: 4 additions & 0 deletions utils/extract-ligand-protein-plugin/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.venv
out
tests
__pycache__
1 change: 1 addition & 0 deletions utils/extract-ligand-protein-plugin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
poetry.lock
5 changes: 5 additions & 0 deletions utils/extract-ligand-protein-plugin/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CHANGELOG

## 0.1.0

Initial release.
27 changes: 27 additions & 0 deletions utils/extract-ligand-protein-plugin/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
FROM condaforge/mambaforge

ENV EXEC_DIR="/opt/executables"
ENV POLUS_LOG="INFO"
RUN mkdir -p ${EXEC_DIR}


# Work directory defined in the base container
# WORKDIR ${EXEC_DIR}

COPY pyproject.toml ${EXEC_DIR}
COPY VERSION ${EXEC_DIR}
COPY README.md ${EXEC_DIR}
COPY CHANGELOG.md ${EXEC_DIR}

# Install needed packages here

RUN pip install filepattern
RUN conda config --add channels conda-forge
RUN conda install mdanalysis


COPY src ${EXEC_DIR}/src

RUN pip3 install ${EXEC_DIR} --no-cache-dir

CMD ["--help"]
12 changes: 12 additions & 0 deletions utils/extract-ligand-protein-plugin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# extract_ligand_protein (0.1.0)

A tool that employs MDAnalysis to extract ligands and protein from a PDB file
## Options

This plugin takes 3 input arguments and 0 output arguments:

| Name | Description | I/O | Type | Default |
|---------------|-------------------------|--------|--------|---------|
| input_pdb_path | Input pdb file path, Type: string, File type: input, Accepted formats: pdb, Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/data/utils/cat_protein.pdb | Input | File | File |
| output_pdb_path | Output pdb file path, Type: string, File type: output, Accepted formats: pdb, Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb | Input | string | string |
| output_pdb_ligand_path | Output pdb ligand file path, Type: string, File type: output, Accepted formats: pdb | Input | string | string |
1 change: 1 addition & 0 deletions utils/extract-ligand-protein-plugin/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1.0
4 changes: 4 additions & 0 deletions utils/extract-ligand-protein-plugin/build-docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

version=$(<VERSION)
docker build . -t polusai/extract-ligand-protein-plugin:${version}
48 changes: 48 additions & 0 deletions utils/extract-ligand-protein-plugin/ict.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
specVersion: "0.1.0"
name: extract_ligand_protein
version: 0.1.0
container: extract-ligand-protein-plugin
entrypoint:
title: extract_ligand_protein
description: A tool that employs MDAnalysis to extract ligands and protein from a PDB file
author: Data Scientist
contact: [email protected]
repository:
documentation:
citation:

inputs:
- name: input_pdb_path
required: true
description: Input pdb file path, Type string, File type input, Accepted formats pdb, Example file https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/data/utils/cat_protein.pdb
type: File
format:
uri: edam:format_1476
- name: output_pdb_path
required: true
description: Output pdb file path, Type string, File type output, Accepted formats pdb, Example file https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb
type: string
defaultValue: system.pdb
format:
uri: edam:format_1476
- name: output_pdb_ligand_path
required: true
description: Output pdb ligand file path, Type string, File type output, Accepted formats pdb
type: string
defaultValue: ligand_system.pdb
format:
uri: edam:format_1476
outputs:
ui:
- key: inputs.input_pdb_path
title: "input_pdb_path: "
description: "Input pdb file path, Type: string, File type: input, Accepted formats: pdb, Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/data/utils/cat_protein.pdb"
type: File
- key: inputs.output_pdb_path
title: "output_pdb_path: "
description: "Output pdb file path, Type: string, File type: output, Accepted formats: pdb, Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb"
type: string
- key: inputs.output_pdb_ligand_path
title: "output_pdb_ligand_path: "
description: "Output pdb ligand file path, Type: string, File type: output, Accepted formats: pdb"
type: string
32 changes: 32 additions & 0 deletions utils/extract-ligand-protein-plugin/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[tool.poetry]
name = "polus-mm-utils-extract-ligand-protein"
version = "0.1.0"
description = "An awesome function."
authors = ["Data Scientist <[email protected]>"]
readme = "README.md"
packages = [{include = "polus", from = "src"}]

[tool.poetry.dependencies]
python = ">=3.9,<3.12"
typer = "^0.7.0"
mdanalysis = "2.7.0"
cwl-utils = "0.33"
cwltool = "3.1.20240404144621"

[tool.poetry.group.dev.dependencies]
bump2version = "^1.0.1"
pytest = "^7.4"
pytest-sugar = "^0.9.6"
pre-commit = "^3.2.1"
black = "^23.3.0"
mypy = "^1.1.1"
ruff = "^0.0.270"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
pythonpath = [
"."
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""extract_ligand_protein."""

__version__ = "0.1.0"

# Re-export the implementation submodule under the package namespace.
# A relative import is used so the package resolves itself no matter how it
# was put on the import path; the previous absolute ``src.polus...`` form only
# worked when a directory containing ``src/`` happened to be on ``sys.path``.
from . import extract_ligand_protein  # noqa # pylint: disable=unused-import
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Package entrypoint for the extract_ligand_protein package."""

# Base packages
import logging
from os import environ
from pathlib import Path

import typer
from polus.mm.utils.extract_ligand_protein.extract_ligand_protein import (
extract_all_ligand_protein,
)

# Configure the root logger's record layout and timestamp format.
logging.basicConfig(
    datefmt="%d-%b-%y %H:%M:%S",
    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
)

# Level constant looked up on the ``logging`` module by the name found in the
# POLUS_LOG environment variable ("INFO" when the variable is not set).
POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "INFO"))

# Logger used by this entrypoint, set to the level resolved above.
logger = logging.getLogger("polus.mm.utils.extract_ligand_protein.")
logger.setLevel(POLUS_LOG)

# Typer application object; ``main`` below is registered as its command.
app = typer.Typer(help="extract_ligand_protein.")


# The docstring of ``main`` is left unchanged because Typer uses a command's
# docstring as its CLI help text.
@app.command()
def main(
    input_pdb_path: Path = typer.Option(
        ...,
        "--input_pdb_path",
        help="Input pdb file path, Type: string",
    ),
    output_pdb_path: str = typer.Option(
        ...,
        "--output_pdb_path",
        help="Output pdb file path, Type: string",
    ),
    output_pdb_ligand_path: str = typer.Option(
        ...,
        "--output_pdb_ligand_path",
        help="Output pdb ligand file path, Type: string",
    ),
) -> None:
    """extract_ligand_protein."""
    # The CLI describes exactly one input file and two explicit output paths,
    # which is the signature of extract_single_ligand_protein.  The batch
    # helper extract_all_ligand_protein takes (list of inputs, outdir) and
    # rejects these keyword arguments with a TypeError, so the single-file
    # function is imported locally and used instead.
    from polus.mm.utils.extract_ligand_protein.extract_ligand_protein import (
        extract_single_ligand_protein,
    )

    logger.info(f"input_pdb_path: {input_pdb_path}")
    logger.info(f"output_pdb_path: {output_pdb_path}")
    logger.info(f"output_pdb_ligand_path: {output_pdb_ligand_path}")

    # The output options arrive as plain strings; convert them to the Path
    # objects the extraction function is annotated with.
    extract_single_ligand_protein(
        input_pdb_path=input_pdb_path,
        output_pdb_path=Path(output_pdb_path),
        output_pdb_ligand_path=Path(output_pdb_ligand_path),
    )


if __name__ == "__main__":
    app()
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Extract ligand and protein from the PDB file."""

from pathlib import Path

import MDAnalysis


def _water_and_unbonded_indices(universe: "MDAnalysis.Universe") -> set:
    """Return the indices of water atoms and of atoms without any bond.

    An atom is treated as a water oxygen when it is named ``"O"``, has exactly
    two bonds, and every bonded atom is named ``"H"``; the oxygen and its two
    bonded atoms are all reported.  Atoms with zero bonds are reported as well.

    Residue names are deliberately not used, because the PDB file may use
    different names for water residues.

    Args:
        universe: Universe whose atoms carry bond information.

    Returns:
        Set of atom indices (``atom.index`` values) to exclude.
    """
    # NOTE(review): this matches on the exact atom names "O" and "H"; files
    # that name water atoms differently (e.g. "OW", "H1"/"H2") would not be
    # recognised here -- confirm against the expected inputs.
    water_bond_count = 2
    excluded = set()
    for atom in universe.atoms:
        bond_count = len(atom.bonds)
        if bond_count == 0:
            # Atoms with no bonds (original intent: salts, waters without H).
            excluded.add(atom.index)
        elif bond_count == water_bond_count and atom.name == "O":
            partners = atom.bonded_atoms
            if {partner.name for partner in partners} == {"H"}:
                excluded.add(atom.index)
                excluded.update(partner.index for partner in partners)
    return excluded


def extract_single_ligand_protein(
    input_pdb_path: Path,
    output_pdb_path: Path,
    output_pdb_ligand_path: Path,
) -> None:
    """Extract ligand & protein from the PDB file.

    The protein is whatever the MDAnalysis selection ``"protein"`` matches.
    The ligand starts as ``"not protein"``; when bond information is
    available, water molecules and atoms without any bond are removed from it.
    The ligand file is only written when at least one ligand atom remains.

    Args:
        input_pdb_path (Path): The path to the input pdb file
        output_pdb_path (Path): The path to the output pdb file
        output_pdb_ligand_path (Path): The path to the output pdb ligand file
    """
    # Load the PDB file and split it into protein / everything else.
    universe = MDAnalysis.Universe(input_pdb_path)
    protein_atoms = universe.select_atoms("protein")
    ligand_atoms = universe.select_atoms("not protein")

    # Bonds are guessed on a second, independent load of the same file.
    # NOTE(review): presumably this keeps the guessed bonds off the universe
    # whose atoms are written out -- confirm.  Atom indices are taken from
    # this copy and applied to selections on the first one.
    bond_universe = MDAnalysis.Universe(input_pdb_path)
    try:
        # guess the bonds, since input PDB may not have bonds
        bond_universe.atoms.guess_bonds()
    except ValueError:
        # ValueError: vdw radii for types: AS.
        # These can be defined manually using the keyword 'vdwradii'
        print("Error: Could not guess bonds. Check the input PDB file.")  # noqa: T201

    try:
        # Accessing .bonds raises NoDataError when the topology has none.
        len(bond_universe.atoms.bonds)
    except MDAnalysis.exceptions.NoDataError:
        print("No bonds found in the PDB file.")  # noqa: T201
    else:
        # Only filter when bonds exist; otherwise the ligand selection is
        # left as "not protein".
        excluded = _water_and_unbonded_indices(bond_universe)
        if excluded:
            index_list = " ".join(str(i) for i in sorted(excluded))
            ligand_atoms = ligand_atoms.select_atoms(f"not index {index_list}")

    # The selections are written directly; no intermediate empty Universe is
    # needed because an AtomGroup already carries its coordinates.
    protein_atoms.write(str(output_pdb_path))
    if len(ligand_atoms) > 0:  # will crash if no ligand atoms
        ligand_atoms.write(str(output_pdb_ligand_path))


def extract_all_ligand_protein(input_pdb_path: list[Path], outdir: Path) -> None:
    """Split every given PDB file into a protein file and a ligand file.

    For each input, the output paths are built inside ``outdir`` as
    ``<stem>_protein.pdb`` and ``<stem>_ligand.pdb``, where ``<stem>`` is the
    input file name without its suffix, and the single-file extraction is run
    with them.

    Args:
        input_pdb_path: Paths of the input pdb files.
        outdir: Directory in which the output paths are built.

    Returns:
        None
    """
    for source_pdb in input_pdb_path:
        stem = source_pdb.stem
        extract_single_ligand_protein(
            source_pdb,
            outdir.joinpath(f"{stem}_protein.pdb"),
            outdir.joinpath(f"{stem}_ligand.pdb"),
        )
1 change: 1 addition & 0 deletions utils/extract-ligand-protein-plugin/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Tests for extract_ligand_protein."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Tests for extract_ligand_protein."""
from pathlib import Path

from polus.mm.utils.extract_ligand_protein.extract_ligand_protein import (
extract_single_ligand_protein,
)


def test_extract_single_ligand_protein() -> None:
    """Test extract_single_ligand_protein.

    Every ``*.pdb`` file next to this test module is split, then the protein
    output is checked to contain no ``HETATM`` records and the ligand output
    (when one was written) to contain no ``ATOM`` records.
    """
    import tempfile

    # Get the parent directory of the current file
    test_dir = Path(__file__).resolve().parent

    # Outputs go to a throw-away directory so the working directory is not
    # polluted and a later run cannot pick up earlier outputs as inputs.
    with tempfile.TemporaryDirectory() as tmp:
        out_dir = Path(tmp)

        # NOTE(review): if no PDB fixtures sit next to this file the loop
        # body never runs and the test passes without checking anything.
        for pdb in sorted(test_dir.glob("*.pdb")):
            output_pdb_path = out_dir / f"{pdb.stem}_protein.pdb"
            output_pdb_ligand_path = out_dir / f"{pdb.stem}_ligand.pdb"
            extract_single_ligand_protein(pdb, output_pdb_path, output_pdb_ligand_path)

            with output_pdb_path.open() as f:
                for line in f:
                    assert not line.startswith("HETATM")

            # The function under test skips the ligand file when there are no
            # ligand atoms, so only inspect it when it exists.
            if output_pdb_ligand_path.exists():
                with output_pdb_ligand_path.open() as f:
                    for line in f:
                        assert not line.startswith("ATOM")

0 comments on commit 07e3016

Please sign in to comment.