From 1115807d868d2133ed54d93eafac9c0d3a4f0a33 Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Fri, 15 Mar 2024 14:15:30 +0100 Subject: [PATCH 1/2] introduce the `prepMols` method to allow customized prep of molecules for descriptors and improve documentation --- qsprpred/data/descriptors/fingerprints.py | 29 +--- qsprpred/data/descriptors/sets.py | 16 +- tutorials/advanced/data/parallelization.ipynb | 155 ++++-------------- 3 files changed, 53 insertions(+), 147 deletions(-) diff --git a/qsprpred/data/descriptors/fingerprints.py b/qsprpred/data/descriptors/fingerprints.py index 79724843..c355df38 100644 --- a/qsprpred/data/descriptors/fingerprints.py +++ b/qsprpred/data/descriptors/fingerprints.py @@ -46,12 +46,20 @@ def isFP(self): def dtype(self): return bool + def prepMols(self, mols: list[str | Mol]) -> list[Mol]: + return [Chem.AddHs(mol) for mol in self.iterMols(mols)] + def __call__( self, mols: list[str | Mol], props: dict[str, list[Any]], *args, **kwargs ) -> pd.DataFrame: """Calculate binary fingerprints for the input molecules. Only the bits specified by `usedBits` will be returned if more bits are calculated. + Before calculating the fingerprints, the molecules are + prepared by adding hydrogens (see `Fingerprint.prepMols`). + If this is undesirable, the user can prepare the molecules + themselves and call `Fingerprint.getDescriptors` directly. 
+ Args: mols(list): list of SMILES or RDKit molecules props(dict): dictionary of properties @@ -61,9 +69,7 @@ def __call__( Returns: data frame of descriptor values of shape (n_mols, n_descriptors) """ - mols = list(self.iterMols(mols, to_list=True)) - mols = [Chem.AddHs(mol) for mol in self.iterMols(mols)] - values = self.getDescriptors(mols, props, *args, **kwargs) + values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs) values = values[:, self.usedBits] values = values.astype(self.dtype) df = pd.DataFrame(values, index=props[self.idProp]) @@ -83,15 +89,6 @@ def __init__(self, radius=2, nBits=2048, **kwargs): def getDescriptors( self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs ) -> np.ndarray: - """Return the Morgan fingerprints for the input molecules. - - Args: - mols: molecules to obtain the fingerprint of - props: dictionary of properties - - Returns: - array: `np.ndarray` of fingerprints for "mols", shape (n_mols, n_bits) - """ convertFP = DataStructs.ConvertToNumpyArray ret = np.zeros((len(mols), len(self))) for idx, mol in enumerate(mols): @@ -116,14 +113,6 @@ class RDKitMACCSFP(Fingerprint): def getDescriptors( self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs ) -> np.ndarray: - """Return the MACCS fingerprints for the input molecules. 
- - Args: - mols: molecules to obtain the fingerprint of - - Returns: - fingerprint (list): `list` of fingerprints for "mols" - """ convertFP = DataStructs.ConvertToNumpyArray ret = np.zeros((len(mols), len(self))) diff --git a/qsprpred/data/descriptors/sets.py b/qsprpred/data/descriptors/sets.py index 63eafbad..8bd037b4 100644 --- a/qsprpred/data/descriptors/sets.py +++ b/qsprpred/data/descriptors/sets.py @@ -66,6 +66,10 @@ def iterMols( ret = list(ret) return ret + def prepMols(self, mols: list[str | Mol]) -> list[Mol]: + """Prepare the molecules for descriptor calculation.""" + return self.iterMols(mols, to_list=True) + def __len__(self): """Return the number of descriptors currently calculated by this instance.""" return len(self.descriptors) @@ -119,6 +123,9 @@ def __call__( to the dtype specified by `self.dtype`. Infinite values are replaced by NaNs using the `treatInfs` method. + The molecules are prepared first by calling the `DescriptorSet.prepMols` method. + If you call `DescriptorSet.getDescriptors` directly, you can skip this step. + Args: mols(list): list of SMILES or RDKit molecules props(dict): dictionary of properties for the passed molecules @@ -128,8 +135,7 @@ def __call__( Returns: data frame of descriptor values of shape (n_mols, n_descriptors) """ - mols = self.iterMols(mols, to_list=True) - values = self.getDescriptors(mols, props, *args, **kwargs) + values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs) df = pd.DataFrame(values, index=props[self.idProp]) df.columns = self.descriptors try: @@ -151,7 +157,11 @@ def __call__( def getDescriptors( self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs ) -> np.ndarray: - """Main method to calculate descriptors for a list of molecules. + """Method to calculate descriptors for a list of molecules. + + This method should use molecules as they are without any preparation. 
+ Any preparation steps should be defined in the `DescriptorSet.prepMols` method, which is picked up by the main `DescriptorSet.__call__`. Args: mols(list): list of SMILES or RDKit molecules diff --git a/tutorials/advanced/data/parallelization.ipynb b/tutorials/advanced/data/parallelization.ipynb index 29aa965e..1d10eb5d 100644 --- a/tutorials/advanced/data/parallelization.ipynb +++ b/tutorials/advanced/data/parallelization.ipynb @@ -4,10 +4,7 @@ "cell_type": "markdown", "id": "9fedcee856268b35", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "# Code Parallelization\n", @@ -77,10 +74,7 @@ "cell_type": "markdown", "id": "8f9ffda3a4b8202f", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "## Setting `nJobs` and `chunkSize`\n", @@ -117,10 +111,7 @@ "end_time": "2024-01-16T16:30:51.361058064Z", "start_time": "2024-01-16T16:30:47.131517756Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -161,10 +152,7 @@ "cell_type": "markdown", "id": "9357f12c0516b989", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "This calculation is done on one CPU by default:" @@ -179,10 +167,7 @@ "end_time": "2024-01-16T16:30:51.368391209Z", "start_time": "2024-01-16T16:30:51.363595085Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -204,10 +189,7 @@ "cell_type": "markdown", "id": "e7e51a9829413df0", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "and the whole data set supplied as one chunk:" @@ -222,10 +204,7 @@ "end_time": "2024-01-16T16:30:51.372183338Z", "start_time": "2024-01-16T16:30:51.367032511Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": 
false }, "outputs": [ { @@ -247,10 +226,7 @@ "cell_type": "markdown", "id": "d28c75dc19273bed", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can now try running this calculation in parallel on 2 CPUs:" @@ -265,10 +241,7 @@ "end_time": "2024-01-16T16:30:51.379969255Z", "start_time": "2024-01-16T16:30:51.375227876Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [], "source": [ @@ -279,10 +252,7 @@ "cell_type": "markdown", "id": "6bc6ee9045cc5f12", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "The chunk size will automatically be adjusted to 25% of the data set size so that each portion of the data set is processed on a separate CPU:" @@ -297,10 +267,7 @@ "end_time": "2024-01-16T16:30:51.411732902Z", "start_time": "2024-01-16T16:30:51.378238063Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -322,10 +289,7 @@ "cell_type": "markdown", "id": "2e21998b62ee78bf", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can see how this affects the time taken to run the calculation:" @@ -340,10 +304,7 @@ "end_time": "2024-01-16T16:30:53.084658845Z", "start_time": "2024-01-16T16:30:51.383586975Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -364,10 +325,7 @@ "cell_type": "markdown", "id": "bc5243c149010a23", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "This was faster, but not by a factor of 4. This is because there is some overhead associated with parallelization and the calculation of fingerprints is very fast by itself so the overhead affects our runtime more. 
In such cases, be careful about setting the chunk size manually:" @@ -382,10 +340,7 @@ "end_time": "2024-01-16T16:31:10.073558913Z", "start_time": "2024-01-16T16:30:53.083216365Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -407,10 +362,7 @@ "cell_type": "markdown", "id": "c9fdc32aa83072e6", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "This was slower than even the single CPU calculation!" @@ -420,10 +372,7 @@ "cell_type": "markdown", "id": "7c2367dd655da9c8", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "## Custom Operations\n", @@ -440,10 +389,7 @@ "end_time": "2024-01-16T16:31:10.082418114Z", "start_time": "2024-01-16T16:31:10.077838705Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -474,10 +420,7 @@ "cell_type": "markdown", "id": "3ada92396624b990", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "As you can see, this gives us a generator object. 
In order to run the function on each chunk and get the results, we need to iterate over the generator and collect results:" @@ -492,10 +435,7 @@ "end_time": "2024-01-16T16:31:10.175831497Z", "start_time": "2024-01-16T16:31:10.081098696Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -539,10 +479,7 @@ "cell_type": "markdown", "id": "a5f2d451e08ec155", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "The results in this case are just four `None` values since our function doesn't return anything:" @@ -557,10 +494,7 @@ "end_time": "2024-01-16T16:31:10.223479222Z", "start_time": "2024-01-16T16:31:10.180906772Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -582,10 +516,7 @@ "cell_type": "markdown", "id": "84a590acb0626ee9", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can also instruct the `apply` method to pass a `DataFrame` instead of a dictionary of properties to the function. This is useful if you want to use the `pandas.DataFrame` API to process the data:" @@ -600,10 +531,7 @@ "end_time": "2024-01-16T16:31:10.254595551Z", "start_time": "2024-01-16T16:31:10.227714969Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -633,10 +561,7 @@ "cell_type": "markdown", "id": "a14646b3cc04daee", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "**WARNING:** The `apply` method does not guarantee that the results will be returned in the same order as the chunks were processed. This is because the chunks are processed in parallel and the order depends on the order in which the parallel processes finish." 
@@ -646,10 +571,7 @@ "cell_type": "markdown", "id": "39fcfa580de331", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "### Molecule Processors\n", @@ -666,10 +588,7 @@ "end_time": "2024-01-16T16:31:10.307074944Z", "start_time": "2024-01-16T16:31:10.228216373Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -734,10 +653,7 @@ "cell_type": "markdown", "id": "d4a679c7ec23c64a", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "With `processMols`, we can also automatically convert the molecules to RDKit molecules before passing them to the processor:" @@ -752,10 +668,7 @@ "end_time": "2024-01-16T16:31:10.955175012Z", "start_time": "2024-01-16T16:31:10.278782050Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -803,10 +716,7 @@ "cell_type": "markdown", "id": "4927b7b9fe7bfa4c", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "You can also derive from `MolProcessorWithID` if you want to access the molecule IDs provided by the data set in your processor. 
This is useful to overcome the issue that the order in which chunks are processed is not guaranteed:" @@ -821,10 +731,7 @@ "end_time": "2024-01-16T16:31:12.843689806Z", "start_time": "2024-01-16T16:31:10.956455648Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { From 935cf1a032cc87b71410e2f200993f9e5e10836a Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Fri, 15 Mar 2024 14:23:51 +0100 Subject: [PATCH 2/2] update CHANGELOG.md --- CHANGELOG.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28e8aca1..05c9f174 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,8 @@ From v3.0.2 to v3.0.3 ## Fixes -- Fixed a bug where an attached standardizer would be refit when calling -`QSPRModel.predictMols` with `use_applicability_domain=True`. +- Fixed a bug where an attached standardizer would be refit when calling + `QSPRModel.predictMols` with `use_applicability_domain=True`. - Fixed random seed not set in `FoldsFromDataSplit.iterFolds` for `ClusterSplit`. ## Changes @@ -14,7 +14,8 @@ None. ## New Features -None. +- Added the `prepMols` method to `DescriptorSet` to allow separate customization of + molecule preparation before descriptor calculation. ## Removed Features