diff --git a/qsprpred/extra/data/storage/protein/interfaces/protein_storage.py b/qsprpred/extra/data/storage/protein/interfaces/protein_storage.py index 79768533..37ba69ca 100644 --- a/qsprpred/extra/data/storage/protein/interfaces/protein_storage.py +++ b/qsprpred/extra/data/storage/protein/interfaces/protein_storage.py @@ -10,6 +10,12 @@ class ProteinStorage(PropertyStorage, ABC): + """Storage for proteins. + + Attributes: + sequenceProp (str): name of the property that contains all protein sequences + proteins (Iterable[StoredProtein]): all proteins in the store + """ @property @abstractmethod @@ -17,31 +23,36 @@ def sequenceProp(self) -> str: """Get the name of the property that contains all protein sequences.""" @abstractmethod - def add_protein(self, protein: StoredProtein, raise_on_existing=True): - """ - Add a protein to the store. + def add_protein(self, protein: StoredProtein, raise_on_existing=True) -> StoredProtein: + """Add a protein to the store. + + Args: + protein (StoredProtein): protein sequence + raise_on_existing (bool): + raise an exception if the protein already exists in the store - :param protein: protein sequence - :param raise_on_existing: raise an exception if the protein already exists in the store - :return: `StoredProtein` instance of the added protein + Returns: + StoredProtein: instance of the added protein """ @property @abstractmethod def proteins(self) -> Iterable[StoredProtein]: - """ - Get all proteins in the store. - - :return: iterable of `Protein` instances + """Get all proteins in the store. + + Returns: + Iterable[StoredProtein]: iterable of `Protein` instances """ @abstractmethod def getProtein(self, protein_id: str) -> StoredProtein: - """ - Get a protein from the store using its name. + """Get a protein from the store using its name. 
- :param protein_id: name of the protein to search - :return: instance of `Protein` + Args: + protein_id (str): name of the protein to search + + Returns: + StoredProtein: the protein with the given name """ @abstractmethod @@ -57,56 +68,72 @@ def getPCMInfo(self) -> tuple[dict[str, str], dict]: class DockableStore(ChemStore, ABC): + """Storage for dockable molecules. + + Attributes: + proteins (Iterable[StoredProtein]): all proteins in the store + """ @abstractmethod def add_target(self, target: StoredProtein, raise_on_existing=True) -> StoredProtein: - """ - Add a target to the store. + """Add a target to the store. + + Args: + target (StoredProtein): target protein + raise_on_existing (bool): + raise an exception if the target already exists in the store - :param target: target protein - :param raise_on_existing: raise an exception if the target already exists in the store - :return: `Protein` instance of the added target + Returns: + StoredProtein: instance of the added target """ @abstractmethod def add_poses(self, mol_id: str, poses: Chem.Mol, target: StoredProtein, metadata: list[dict[str, Any]] | None = None) -> list[StoredMol]: - """ - Add poses to the store. - - :param mol_id: identifier of the molecule to add poses for - :param poses: dictionary of target identifiers and poses - :param target: target protein - :param metadata: additional metadata to store with the poses - :return: Added poses represented as `StoredMol` + """Add poses to the store. + + Args: + mol_id (str): identifier of the molecule to add poses for + poses (Chem.Mol): poses of the molecule to add + target (StoredProtein): target protein + metadata (list[dict[str, Any]] | None): additional metadata to store with the poses + + Returns: + list[StoredMol]: Added poses represented as `StoredMol` """ @abstractmethod def get_poses(self, mol_id: str, target_id: str) -> list[StoredMol]: - """ - Get poses from the store. 
- - :param mol_id: identifier of the molecule to get poses for - :param target_id: identifier of the target to get poses for - :return: + """Get poses from the store. + + Args: + mol_id (str): identifier of the molecule to get poses for + target_id (str): identifier of the target to get poses for + + Returns: + list[StoredMol]: poses for the molecule and target """ @abstractmethod def get_complex_for_pose(self, mol_id: str, target_id: str) -> Chem.Mol: - """ - Get the complex for a pose. - - :param mol_id: identifier of the molecule to get the complex for - :param target_id: identifier of the target to get the complex for - :return: tuple of the complex and the target + """Get the complex for a pose. + + Args: + mol_id (str): identifier of the molecule to get the complex for + target_id (str): identifier of the target to get the complex for + + Returns: + Chem.Mol: complex of the pose and the target """ @abstractmethod def get_target(self, target_id: str) -> StoredProtein: - """ - Get a target from the store using its ID. - - :param target_id: identifier of the target to search - :return: instance of `Protein` + """Get a target from the store using its ID. + + Args: + target_id (str): identifier of the target to search + + Returns: + StoredProtein: the target with the given ID """ diff --git a/qsprpred/extra/data/storage/protein/interfaces/storedprotein.py b/qsprpred/extra/data/storage/protein/interfaces/storedprotein.py index e14b3d2b..9c8cea15 100644 --- a/qsprpred/extra/data/storage/protein/interfaces/storedprotein.py +++ b/qsprpred/extra/data/storage/protein/interfaces/storedprotein.py @@ -5,34 +5,45 @@ class StoredProtein(ABC): - """ - A protein object + """A protein object. 
+ + Attributes: + id (str): id of the protein + sequence (str | None): sequence of the protein (may be None) + props (dict[str, Any] | None): properties of the protein (may be None) + representations (Iterable[StoredProtein]): representations of the protein """ @property @abstractmethod def id(self) -> str: + """Get the id of the protein.""" pass @property @abstractmethod def sequence(self) -> str | None: + """Get the sequence of the protein.""" pass @property @abstractmethod def props(self) -> dict[str, Any] | None: + """Get the properties of the protein.""" pass @abstractmethod def as_pdb(self) -> str | None: + """Return the protein as a PDB block string (or None).""" pass @abstractmethod def as_fasta(self) -> str | None: + """Return the protein as a FASTA string (or None).""" pass def as_rd_mol(self) -> Chem.Mol | None: + """Return the protein as an RDKit molecule built from its PDB block.""" pdb = self.as_pdb() if pdb is not None: return Chem.MolFromPDBBlock(self.as_pdb()) @@ -40,4 +51,5 @@ def as_rd_mol(self) -> Chem.Mol | None: @property @abstractmethod def representations(self) -> Iterable["StoredProtein"]: + """Get all representations of the protein.""" pass diff --git a/qsprpred/extra/data/storage/protein/tabular_pcm.py b/qsprpred/extra/data/storage/protein/tabular_pcm.py index e32f7448..a83d71d5 100644 --- a/qsprpred/extra/data/storage/protein/tabular_pcm.py +++ b/qsprpred/extra/data/storage/protein/tabular_pcm.py @@ -12,6 +12,14 @@ class TabularProtein(StoredProtein): + """A protein object that is stored in a tabular format. + + Attributes: + id (str): id of the protein + sequence (str | None): sequence of the protein (may be None) + props (dict[str, Any] | None): properties of the protein (may be None) + representations (Iterable[TabularProtein]): representations of the protein + """ def __init__( self, @@ -21,12 +29,14 @@ def __init__( props: dict[str, Any] | None = None, representations: Iterable["TabularProtein"] | None = None, ) -> None: - """ - Create a new protein instance. 
- - :param parent: parent protein - :param protein_id: identifier of the protein - :param sequence: sequence of the protein + """Create a new protein instance. + + Args: + protein_id (str): identifier of the protein + sequence (str): sequence of the protein + parent (TabularProtein): parent protein + props (dict[str, Any]): properties of the protein + representations (Iterable[TabularProtein]): representations of the protein """ self._parent = parent self._id = protein_id @@ -36,28 +46,42 @@ def __init__( @property def id(self) -> str: + """Get the id of the protein.""" return self._id @property def sequence(self) -> str | None: + """Get the sequence of the protein.""" return self._sequence @property def props(self) -> dict[str, Any] | None: + """Get the properties of the protein.""" return self._props def as_pdb(self) -> str | None: + """Return the protein as a PDB file.""" return self._props["pdb"] if "pdb" in self._props else None def as_fasta(self) -> str | None: + """Return the protein as a FASTA file.""" return self._props["fasta"] if "fasta" in self._props else None @property def representations(self) -> Iterable["TabularProtein"]: + """Get all representations of the protein.""" return self._representations class TabularProteinStorage(ProteinStorage, PandasDataTable): + """A storage class for proteins stored in a tabular format. + + Attributes: + sequenceCol (str): name of the column that contains all protein sequences + proteinSeqProvider (Callable): function that provides protein + sequenceProp (str): name of the property that contains all protein sequences + proteins (Iterable[TabularProtein]): all proteins in the store + """ def __init__( self, @@ -75,6 +99,23 @@ def __init__( store_format: str = "pkl", parallel_generator: ParallelGenerator | None = None, ): + """Create a new protein storage instance. 
+ + Args: + name (str): name of the storage + df (pd.DataFrame): data frame containing the proteins + sequence_col (str): name of the column that contains all protein sequences + sequence_provider (Callable): function that provides protein + store_dir (str): directory to store the data + overwrite (bool): overwrite the existing data + index_cols (list[str]): columns to use as index + n_jobs (int): number of parallel jobs + chunk_size (int): size of the chunks + protein_col (str): name of the column that contains the protein ids + random_state (int): random state + store_format (str): format to store the data + parallel_generator (ParallelGenerator): parallel generator + """ super().__init__( name, df if df is not None else pd.DataFrame(columns=[sequence_col, @@ -158,9 +199,17 @@ def getPCMInfo(self) -> tuple[dict[str, str], dict]: @property def sequenceProp(self) -> str: + """Get the name of the property that contains all protein sequences.""" return self._sequenceCol def add_protein(self, protein: TabularProtein, raise_on_existing=True): + """Add a protein to the store. + + Args: + protein (TabularProtein): protein sequence + raise_on_existing (bool): + raise an exception if the protein already exists in the store + """ self.addEntries( [protein.id], {prop: [val] for prop, val in protein.props}, @@ -168,6 +217,14 @@ def add_protein(self, protein: TabularProtein, raise_on_existing=True): ) def _make_proteins_from_chunk(self, df: pd.DataFrame) -> list[TabularProtein]: + """Create a list of proteins from a chunk of the data frame. + + Args: + df (pd.DataFrame): chunk of the data frame + + Returns: + list[TabularProtein]: list of proteins + """ ids = df[self.idProp].values sequences = df[self.sequenceProp].values props = df.columns.difference([self.idProp, self.sequenceProp]) @@ -182,12 +239,28 @@ def _make_proteins_from_chunk(self, df: pd.DataFrame) -> list[TabularProtein]: @property def proteins(self) -> list[TabularProtein]: + """Get all proteins in the store. 
+ + Returns: + list[TabularProtein]: list of proteins + """ ret = [] for chunk in self.iterChunks(len(self)): ret.extend(self._make_proteins_from_chunk(chunk)) return ret def getProtein(self, protein_id: str) -> TabularProtein: + """Get a protein from the store using its name. + + Args: + protein_id (str): name of the protein to search + + Returns: + TabularProtein: instance of `Protein` + + Raises: + ValueError: if the protein is not found + """ df = self.getDF() protein = df[df[self.idProp] == protein_id] if protein.empty: