Skip to content

Commit

Permalink
Merge branch 'refactor/prep_before_desc_calc' into 'dev'
Browse files Browse the repository at this point in the history
Introduce `prepMols` and improve docs

Closes #87

See merge request cdd/QSPRpred!182
  • Loading branch information
martin-sicho committed Mar 19, 2024
2 parents d703692 + 935cf1a commit 7f63e85
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 150 deletions.
7 changes: 4 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ From v3.0.2 to v3.0.3

## Fixes

- Fixed a bug where an attached standardizer would be refit when calling
`QSPRModel.predictMols` with `use_applicability_domain=True`.
- Fixed a bug where an attached standardizer would be refit when calling
`QSPRModel.predictMols` with `use_applicability_domain=True`.
- Fixed random seed not set in `FoldsFromDataSplit.iterFolds` for `ClusterSplit`.

## Changes
Expand All @@ -14,7 +14,8 @@ None.

## New Features

None.
- Added the `prepMols` method to `DescriptorSet` to allow separate customization of
molecule preparation before descriptor calculation.

## Removed Features

Expand Down
29 changes: 9 additions & 20 deletions qsprpred/data/descriptors/fingerprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,20 @@ def isFP(self):
def dtype(self):
return bool

def prepMols(self, mols: list[str | Mol]) -> list[Mol]:
return [Chem.AddHs(mol) for mol in self.iterMols(mols)]

def __call__(
self, mols: list[str | Mol], props: dict[str, list[Any]], *args, **kwargs
) -> pd.DataFrame:
"""Calculate binary fingerprints for the input molecules. Only the bits
specified by `usedBits` will be returned if more bits are calculated.
Before calculating the fingerprints, the molecules are
prepared by adding hydrogens (see `Fingerprint.prepMols`).
If this is undesirable, the user can prepare the molecules
themselves and call `Fingerprint.getDescriptors` directly.
Args:
mols(list): list of SMILES or RDKit molecules
props(dict): dictionary of properties
Expand All @@ -61,9 +69,7 @@ def __call__(
Returns:
data frame of descriptor values of shape (n_mols, n_descriptors)
"""
mols = list(self.iterMols(mols, to_list=True))
mols = [Chem.AddHs(mol) for mol in self.iterMols(mols)]
values = self.getDescriptors(mols, props, *args, **kwargs)
values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs)
values = values[:, self.usedBits]
values = values.astype(self.dtype)
df = pd.DataFrame(values, index=props[self.idProp])
Expand All @@ -83,15 +89,6 @@ def __init__(self, radius=2, nBits=2048, **kwargs):
def getDescriptors(
self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
) -> np.ndarray:
"""Return the Morgan fingerprints for the input molecules.
Args:
mols: molecules to obtain the fingerprint of
props: dictionary of properties
Returns:
array: `np.ndarray` of fingerprints for "mols", shape (n_mols, n_bits)
"""
convertFP = DataStructs.ConvertToNumpyArray
ret = np.zeros((len(mols), len(self)))
for idx, mol in enumerate(mols):
Expand All @@ -116,14 +113,6 @@ class RDKitMACCSFP(Fingerprint):
def getDescriptors(
self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
) -> np.ndarray:
"""Return the MACCS fingerprints for the input molecules.
Args:
mols: molecules to obtain the fingerprint of
Returns:
fingerprint (list): `list` of fingerprints for "mols"
"""
convertFP = DataStructs.ConvertToNumpyArray

ret = np.zeros((len(mols), len(self)))
Expand Down
16 changes: 13 additions & 3 deletions qsprpred/data/descriptors/sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ def iterMols(
ret = list(ret)
return ret

def prepMols(self, mols: list[str | Mol]) -> list[Mol]:
"""Prepare the molecules for descriptor calculation."""
return self.iterMols(mols, to_list=True)

def __len__(self):
"""Return the number of descriptors currently calculated by this instance."""
return len(self.descriptors)
Expand Down Expand Up @@ -119,6 +123,9 @@ def __call__(
to the dtype specified by `self.dtype`. Infinite values are replaced by NaNs
using the `treatInfs` method.
The molecules are prepared first by calling the `DescriptorSet.prepMols` method.
If you call `DescriptorSet.getDescriptors` directly, you can skip this step.
Args:
mols(list): list of SMILES or RDKit molecules
props(dict): dictionary of properties for the passed molecules
Expand All @@ -128,8 +135,7 @@ def __call__(
Returns:
data frame of descriptor values of shape (n_mols, n_descriptors)
"""
mols = self.iterMols(mols, to_list=True)
values = self.getDescriptors(mols, props, *args, **kwargs)
values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs)
df = pd.DataFrame(values, index=props[self.idProp])
df.columns = self.descriptors
try:
Expand All @@ -151,7 +157,11 @@ def __call__(
def getDescriptors(
self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
) -> np.ndarray:
"""Main method to calculate descriptors for a list of molecules.
"""Method to calculate descriptors for a list of molecules.
This method should use molecules as they are without any preparation.
Any preparation steps should be defined in the `DescriptorSet.prepMols` method,
which is picked up by the main `DescriptorSet.__call__`.
Args:
mols(list): list of SMILES or RDKit molecules
Expand Down
Loading

0 comments on commit 7f63e85

Please sign in to comment.