Skip to content

Commit

Permalink
add documentation to extra.storage
Browse files Browse the repository at this point in the history
  • Loading branch information
HellevdM committed Sep 3, 2024
1 parent 99e505d commit d8c03a0
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 52 deletions.
115 changes: 71 additions & 44 deletions qsprpred/extra/data/storage/protein/interfaces/protein_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,38 +10,49 @@


class ProteinStorage(PropertyStorage, ABC):
"""Storage for proteins.
Attributes:
sequenceProp (str): name of the property that contains all protein sequences
proteins (Iterable[StoredProtein]): all proteins in the store
"""

@property
@abstractmethod
def sequenceProp(self) -> str:
"""Get the name of the property that contains all protein sequences.

Returns:
str: name of the sequence property
"""

@abstractmethod
def add_protein(self, protein: StoredProtein, raise_on_existing=True):
"""
Add a protein to the store.
def add_protein(self, protein: StoredProtein, raise_on_existing=True) -> StoredProtein:
"""Add a protein to the store.
Args:
protein (StoredProtein): protein to add
raise_on_existing (bool):
raise an exception if the protein already exists in the store
:param protein: protein sequence
:param raise_on_existing: raise an exception if the protein already exists in the store
:return: `StoredProtein` instance of the added protein
Returns:
StoredProtein: instance of the added protein
"""

@property
@abstractmethod
def proteins(self) -> Iterable[StoredProtein]:
"""
Get all proteins in the store.
:return: iterable of `Protein` instances
"""Get all proteins in the store.
Returns:
Iterable[StoredProtein]: iterable of `StoredProtein` instances
"""

@abstractmethod
def getProtein(self, protein_id: str) -> StoredProtein:
"""
Get a protein from the store using its name.
"""Get a protein from the store using its name.
:param protein_id: name of the protein to search
:return: instance of `Protein`
Args:
protein_id (str): name of the protein to search
Returns:
StoredProtein: instance of `StoredProtein` with the given name
"""

@abstractmethod
Expand All @@ -57,56 +68,72 @@ def getPCMInfo(self) -> tuple[dict[str, str], dict]:


class DockableStore(ChemStore, ABC):
"""Storage for dockable molecules.
Attributes:
proteins (Iterable[StoredProtein]): all proteins in the store
"""

@abstractmethod
def add_target(self, target: StoredProtein,
raise_on_existing=True) -> StoredProtein:
"""
Add a target to the store.
"""Add a target to the store.
Args:
target (StoredProtein): target protein
raise_on_existing (bool):
raise an exception if the target already exists in the store
:param target: target protein
:param raise_on_existing: raise an exception if the target already exists in the store
:return: `Protein` instance of the added target
Returns:
StoredProtein: instance of the added target
"""

@abstractmethod
def add_poses(self, mol_id: str, poses: Chem.Mol, target: StoredProtein,
metadata: list[dict[str, Any]] | None = None) -> list[StoredMol]:
"""
Add poses to the store.
:param mol_id: identifier of the molecule to add poses for
:param poses: dictionary of target identifiers and poses
:param target: target protein
:param metadata: additional metadata to store with the poses
:return: Added poses represented as `StoredMol`
"""Add poses to the store.
Args:
mol_id (str): identifier of the molecule to add poses for
poses (Chem.Mol): poses to add, given as an RDKit molecule
target (StoredProtein): target protein
metadata (list[dict[str, Any]]): additional metadata to store with the poses
Returns:
list[StoredMol]: Added poses represented as `StoredMol`
"""

@abstractmethod
def get_poses(self, mol_id: str, target_id: str) -> list[StoredMol]:
"""
Get poses from the store.
:param mol_id: identifier of the molecule to get poses for
:param target_id: identifier of the target to get poses for
:return:
"""Get poses from the store.
Args:
mol_id (str): identifier of the molecule to get poses for
target_id (str): identifier of the target to get poses for
Returns:
list[StoredMol]: poses for the molecule and target
"""

@abstractmethod
def get_complex_for_pose(self, mol_id: str, target_id: str) -> Chem.Mol:
"""
Get the complex for a pose.
:param mol_id: identifier of the molecule to get the complex for
:param target_id: identifier of the target to get the complex for
:return: tuple of the complex and the target
"""Get the complex for a pose.
Args:
mol_id (str): identifier of the molecule to get the complex for
target_id (str): identifier of the target to get the complex for
Returns:
Chem.Mol: the complex of the pose and the target
"""

@abstractmethod
def get_target(self, target_id: str) -> StoredProtein:
"""
Get a target from the store using its ID.
:param target_id: identifier of the target to search
:return: instance of `Protein`
"""Get a target from the store using its ID.
Args:
target_id (str): identifier of the target to search
Returns:
StoredProtein: instance of `StoredProtein` with the given ID
"""
16 changes: 14 additions & 2 deletions qsprpred/extra/data/storage/protein/interfaces/storedprotein.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,51 @@


class StoredProtein(ABC):
"""
A protein object
"""A protein object.
Attributes:
id (str): id of the protein
sequence (str): sequence of the protein
props (dict[str, Any]): properties of the protein
representations (Iterable[StoredProtein]): representations of the protein
"""

@property
@abstractmethod
def id(self) -> str:
"""Get the id of the protein.

Returns:
str: id of the protein
"""
pass

@property
@abstractmethod
def sequence(self) -> str | None:
"""Get the sequence of the protein.

Returns:
str | None: sequence of the protein, or None (permitted by the
return annotation; presumably when no sequence is known - confirm)
"""
pass

@property
@abstractmethod
def props(self) -> dict[str, Any] | None:
"""Get the properties of the protein.

Returns:
dict[str, Any] | None: mapping of property names to values, or None
(permitted by the return annotation)
"""
pass

@abstractmethod
def as_pdb(self) -> str | None:
"""Return the protein in PDB format.

Returns:
str | None: PDB block as a string (`as_rd_mol` passes it to
`Chem.MolFromPDBBlock`), or None if no PDB representation is available
"""
pass

@abstractmethod
def as_fasta(self) -> str | None:
"""Return the protein in FASTA format.

Returns:
str | None: FASTA representation as a string, or None (permitted by
the return annotation)
"""
pass

def as_rd_mol(self) -> Chem.Mol | None:
    """Return the protein as an RDKit molecule.

    The PDB block obtained from `as_pdb` is parsed with
    `Chem.MolFromPDBBlock`.

    Returns:
        Chem.Mol | None: the parsed molecule, or None if `as_pdb` returns None
    """
    pdb = self.as_pdb()
    if pdb is None:
        return None
    # reuse the block fetched above instead of calling `as_pdb` a second time
    return Chem.MolFromPDBBlock(pdb)

@property
@abstractmethod
def representations(self) -> Iterable["StoredProtein"]:
"""Get all representations of the protein.

Returns:
Iterable[StoredProtein]: representations of this protein, each itself
a `StoredProtein`
"""
pass
85 changes: 79 additions & 6 deletions qsprpred/extra/data/storage/protein/tabular_pcm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@


class TabularProtein(StoredProtein):
"""A protein object that is stored in a tabular format.
Attributes:
id (str): id of the protein
sequence (str): sequence of the protein
props (dict[str, Any]): properties of the protein
representations (Iterable[TabularProtein]): representations of the protein
"""

def __init__(
self,
Expand All @@ -21,12 +29,14 @@ def __init__(
props: dict[str, Any] | None = None,
representations: Iterable["TabularProtein"] | None = None,
) -> None:
"""
Create a new protein instance.
:param parent: parent protein
:param protein_id: identifier of the protein
:param sequence: sequence of the protein
"""Create a new protein instance.
Args:
protein_id (str): identifier of the protein
sequence (str): sequence of the protein
parent (TabularProtein): parent protein
props (dict[str, Any]): properties of the protein
representations (Iterable[TabularProtein]): representations of the protein
"""
self._parent = parent
self._id = protein_id
Expand All @@ -36,28 +46,42 @@ def __init__(

@property
def id(self) -> str:
"""Get the id of the protein.

Returns:
str: identifier stored on this instance (`_id`)
"""
return self._id

@property
def sequence(self) -> str | None:
"""Get the sequence of the protein.

Returns:
str | None: sequence stored on this instance (`_sequence`); may be None
"""
return self._sequence

@property
def props(self) -> dict[str, Any] | None:
"""Get the properties of the protein.

Returns:
dict[str, Any] | None: the stored property mapping itself (`_props`),
not a copy; may be None
"""
return self._props

def as_pdb(self) -> str | None:
"""Return the protein as a PDB file."""
return self._props["pdb"] if "pdb" in self._props else None

def as_fasta(self) -> str | None:
"""Return the protein as a FASTA file."""
return self._props["fasta"] if "fasta" in self._props else None

@property
def representations(self) -> Iterable["TabularProtein"]:
"""Get all representations of the protein.

Returns:
Iterable[TabularProtein]: the stored representations (`_representations`)
"""
return self._representations


class TabularProteinStorage(ProteinStorage, PandasDataTable):
"""A storage class for proteins stored in a tabular format.
Attributes:
sequenceCol (str): name of the column that contains all protein sequences
proteinSeqProvider (Callable): function that provides protein sequences
sequenceProp (str): name of the property that contains all protein sequences
proteins (Iterable[TabularProtein]): all proteins in the store
"""

def __init__(
self,
Expand All @@ -75,6 +99,23 @@ def __init__(
store_format: str = "pkl",
parallel_generator: ParallelGenerator | None = None,
):
"""Create a new protein storage instance.
Args:
name (str): name of the storage
df (pd.DataFrame): data frame containing the proteins
sequence_col (str): name of the column that contains all protein sequences
sequence_provider (Callable): function that provides protein sequences
store_dir (str): directory to store the data
overwrite (bool): overwrite the existing data
index_cols (list[str]): columns to use as index
n_jobs (int): number of parallel jobs
chunk_size (int): size of the chunks
protein_col (str): name of the column that contains the protein ids
random_state (int): random state
store_format (str): format to store the data
parallel_generator (ParallelGenerator): parallel generator
"""
super().__init__(
name,
df if df is not None else pd.DataFrame(columns=[sequence_col,
Expand Down Expand Up @@ -158,16 +199,32 @@ def getPCMInfo(self) -> tuple[dict[str, str], dict]:

@property
def sequenceProp(self) -> str:
"""Get the name of the property that contains all protein sequences.

Returns:
str: the stored sequence column name (`_sequenceCol`)
"""
return self._sequenceCol

def add_protein(
    self,
    protein: TabularProtein,
    raise_on_existing=True,
) -> TabularProtein:
    """Add a protein to the store.

    The protein's id and its properties are forwarded to `addEntries`,
    each property value wrapped in a one-element list.

    Args:
        protein (TabularProtein): protein to add
        raise_on_existing (bool):
            raise an exception if the protein already exists in the store

    Returns:
        TabularProtein: the protein that was passed in
    """
    # `props` may be None; treat that as "no properties"
    props = protein.props or {}
    # NOTE(review): only the id and props are forwarded; the sequence is
    # stored only if it is part of props - confirm this is intended
    self.addEntries(
        [protein.id],
        # iterate key/value pairs: iterating the dict itself yields only keys
        {prop: [val] for prop, val in props.items()},
        raise_on_existing,
    )
    return protein

def _make_proteins_from_chunk(self, df: pd.DataFrame) -> list[TabularProtein]:
"""Create a list of proteins from a chunk of the data frame.
Args:
df (pd.DataFrame): chunk of the data frame
Returns:
list[TabularProtein]: list of proteins
"""
ids = df[self.idProp].values
sequences = df[self.sequenceProp].values
props = df.columns.difference([self.idProp, self.sequenceProp])
Expand All @@ -182,12 +239,28 @@ def _make_proteins_from_chunk(self, df: pd.DataFrame) -> list[TabularProtein]:

@property
def proteins(self) -> list[TabularProtein]:
"""Get all proteins in the store.

A new list is built on every access by converting each chunk returned by
`iterChunks` with `_make_proteins_from_chunk`.

Returns:
list[TabularProtein]: list of proteins
"""
ret = []
# len(self) is passed to iterChunks - presumably the chunk size, so the
# whole table arrives as one chunk (confirm against iterChunks)
for chunk in self.iterChunks(len(self)):
ret.extend(self._make_proteins_from_chunk(chunk))
return ret

def getProtein(self, protein_id: str) -> TabularProtein:
"""Get a protein from the store using its name.
Args:
protein_id (str): name of the protein to search
Returns:
TabularProtein: instance of `Protein`
Raises:
ValueError: if the protein is not found
"""
df = self.getDF()
protein = df[df[self.idProp] == protein_id]
if protein.empty:
Expand Down

0 comments on commit d8c03a0

Please sign in to comment.