Merge branch 'refactor/prep_before_desc_calc' into 'dev'

Introduce `prepMols` and improve docs. Closes #87. See merge request cdd/QSPRpred!182.
CDDLeiden · Mar 19, 2024 · 7f63e85 · 7f63e85
2 parents d703692 + 935cf1a
commit 7f63e85
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 150 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,8 +4,8 @@ From v3.0.2 to v3.0.3
 
 ## Fixes
 
-- Fixed a bug where an attached standardizer would be refit when calling 
-`QSPRModel.predictMols` with `use_applicability_domain=True`.
+- Fixed a bug where an attached standardizer would be refit when calling
+  `QSPRModel.predictMols` with `use_applicability_domain=True`.
 - Fixed random seed not set in `FoldsFromDataSplit.iterFolds` for `ClusterSplit`.
 
 ## Changes
@@ -14,7 +14,8 @@ None.
 
 ## New Features
 
-None.
+- Added the `prepMols` method to `DescriptorSet` to allow separate customization of
+  molecule preparation before descriptor calculation.
 
 ## Removed Features
 

diff --git a/qsprpred/data/descriptors/fingerprints.py b/qsprpred/data/descriptors/fingerprints.py
@@ -46,12 +46,20 @@ def isFP(self):
     def dtype(self):
         return bool
 
+    def prepMols(self, mols: list[str | Mol]) -> list[Mol]:
+        return [Chem.AddHs(mol) for mol in self.iterMols(mols)]
+
     def __call__(
         self, mols: list[str | Mol], props: dict[str, list[Any]], *args, **kwargs
     ) -> pd.DataFrame:
         """Calculate binary fingerprints for the input molecules. Only the bits
         specified by `usedBits` will be returned if more bits are calculated.
 
+        Before calculating the fingerprints, the molecules are
+        prepared by adding hydrogens (see `Fingerprint.prepMols`).
+        If this is undesirable, the user can prepare the molecules
+        themselves and call `Fingerprint.getDescriptors` directly.
+
         Args:
             mols(list): list of SMILES or RDKit molecules
             props(dict): dictionary of properties
@@ -61,9 +69,7 @@ def __call__(
         Returns:
             data frame of descriptor values of shape (n_mols, n_descriptors)
         """
-        mols = list(self.iterMols(mols, to_list=True))
-        mols = [Chem.AddHs(mol) for mol in self.iterMols(mols)]
-        values = self.getDescriptors(mols, props, *args, **kwargs)
+        values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs)
         values = values[:, self.usedBits]
         values = values.astype(self.dtype)
         df = pd.DataFrame(values, index=props[self.idProp])
@@ -83,15 +89,6 @@ def __init__(self, radius=2, nBits=2048, **kwargs):
     def getDescriptors(
         self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
     ) -> np.ndarray:
-        """Return the Morgan fingerprints for the input molecules.
-
-        Args:
-            mols: molecules to obtain the fingerprint of
-            props: dictionary of properties
-
-        Returns:
-            array: `np.ndarray` of fingerprints for "mols", shape (n_mols, n_bits)
-        """
         convertFP = DataStructs.ConvertToNumpyArray
         ret = np.zeros((len(mols), len(self)))
         for idx, mol in enumerate(mols):
@@ -116,14 +113,6 @@ class RDKitMACCSFP(Fingerprint):
     def getDescriptors(
         self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
     ) -> np.ndarray:
-        """Return the MACCS fingerprints for the input molecules.
-
-        Args:
-            mols: molecules to obtain the fingerprint of
-
-        Returns:
-            fingerprint (list): `list` of fingerprints for "mols"
-        """
         convertFP = DataStructs.ConvertToNumpyArray
 
         ret = np.zeros((len(mols), len(self)))

diff --git a/qsprpred/data/descriptors/sets.py b/qsprpred/data/descriptors/sets.py
@@ -66,6 +66,10 @@ def iterMols(
             ret = list(ret)
         return ret
 
+    def prepMols(self, mols: list[str | Mol]) -> list[Mol]:
+        """Prepare the molecules for descriptor calculation."""
+        return self.iterMols(mols, to_list=True)
+
     def __len__(self):
         """Return the number of descriptors currently calculated by this instance."""
         return len(self.descriptors)
@@ -119,6 +123,9 @@ def __call__(
         to the dtype specified by `self.dtype`. Infinite values are replaced by NaNs
         using the `treatInfs` method.
 
+        The molecules are prepared first by calling the `DescriptorSet.prepMols` method.
+        If you call `DescriptorSet.getDescriptors` directly, you can skip this step.
+
         Args:
             mols(list): list of SMILES or RDKit molecules
             props(dict): dictionary of properties for the passed molecules
@@ -128,8 +135,7 @@ def __call__(
         Returns:
             data frame of descriptor values of shape (n_mols, n_descriptors)
         """
-        mols = self.iterMols(mols, to_list=True)
-        values = self.getDescriptors(mols, props, *args, **kwargs)
+        values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs)
         df = pd.DataFrame(values, index=props[self.idProp])
         df.columns = self.descriptors
         try:
@@ -151,7 +157,11 @@ def __call__(
     def getDescriptors(
         self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
     ) -> np.ndarray:
-        """Main method to calculate descriptors for a list of molecules.
+        """Method to calculate descriptors for a list of molecules.
+
+        This method should use molecules as they are without any preparation.
+        Any preparation steps should be defined in the `DescriptorSet.prepMols` method,
+        which is picked up by the main `DescriptorSet.__call__`.
 
         Args:
             mols(list): list of SMILES or RDKit molecules