diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..661a5d1 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,20 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +mkdocs: + configuration: mkdocs.yml + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3be3520..f8d0ff7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -12,7 +12,7 @@ pool: steps: - task: UsePythonVersion@0 inputs: - versionSpec: '3.8' + versionSpec: '3.11' architecture: 'x64' - script: | @@ -23,4 +23,4 @@ steps: - script: | cd tests python test_all.py - displayName: 'unittest' + displayName: 'Unit testing' diff --git a/docs/requirements.txt b/docs/requirements.txt index b5d2878..9c81b6a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,2 @@ mkdocs-material -mkdocstrings \ No newline at end of file +mkdocstrings-python \ No newline at end of file diff --git a/ecnet/__init__.py b/ecnet/__init__.py index 017f38b..e2b068b 100644 --- a/ecnet/__init__.py +++ b/ecnet/__init__.py @@ -1,2 +1,2 @@ from .model import ECNet -__version__ = '4.1.0' +__version__ = '4.1.1' diff --git a/ecnet/datasets/structs.py b/ecnet/datasets/structs.py index b023871..ba7fdaf 100644 --- a/ecnet/datasets/structs.py +++ b/ecnet/datasets/structs.py @@ -2,6 +2,7 @@ from typing import List, Tuple, Iterable import torch from torch.utils.data import Dataset +from sklearn.decomposition import PCA from .utils import _qspr_from_padel, _qspr_from_alvadesc,\ _qspr_from_alvadesc_smifile @@ -21,9 +22,9 @@ def __init__(self, smiles: List[str], target_vals: 
Iterable[Iterable[float]], """ self.smiles = smiles - self.target_vals = torch.as_tensor(target_vals) + self.target_vals = torch.as_tensor(target_vals).type(torch.float32) self.desc_vals, self.desc_names = self.smi_to_qspr(smiles, backend) - self.desc_vals = torch.as_tensor(self.desc_vals) + self.desc_vals = torch.as_tensor(self.desc_vals).type(torch.float32) @staticmethod def smi_to_qspr(smiles: List[str], backend: str) -> Tuple[List[List[float]], List[str]]: @@ -110,17 +111,17 @@ def __init__(self, smiles_fn: str, target_vals: Iterable[Iterable[float]], """ self.smiles = self._open_smiles_file(smiles_fn) - self.target_vals = torch.as_tensor(target_vals) + self.target_vals = torch.as_tensor(target_vals).type(torch.float32) if backend == 'padel': self.desc_vals, self.desc_names = self.smi_to_qspr( self.smiles, backend ) - self.desc_vals = torch.as_tensor(self.desc_vals) + self.desc_vals = torch.as_tensor(self.desc_vals).type(torch.float32) elif backend == 'alvadesc': self.desc_vals, self.desc_names = _qspr_from_alvadesc_smifile( smiles_fn ) - self.desc_vals = torch.as_tensor(self.desc_vals) + self.desc_vals = torch.as_tensor(self.desc_vals).type(torch.float32) @staticmethod def _open_smiles_file(smiles_fn: str) -> List[str]: @@ -156,5 +157,35 @@ def __init__(self, desc_vals: Iterable[Iterable[float]], self.smiles = ['' for _ in range(len(target_vals))] self.desc_names = ['' for _ in range(len(desc_vals[0]))] - self.desc_vals = torch.as_tensor(desc_vals) - self.target_vals = torch.as_tensor(target_vals) + self.desc_vals = torch.as_tensor(desc_vals).type(torch.float32) + self.target_vals = torch.as_tensor(target_vals).type(torch.float32) + + +class PCADataset(QSPRDataset): + + def __init__(self, smiles: List[str], target_vals: Iterable[Iterable[float]], + backend: str = 'padel', existing_pca_dataset: 'PCADataset' = None): + """ + PCADataset: creates a torch.utils.data.Dataset given supplied SMILES strings, supplied + target values; first generates QSPR descriptors, 
then transforms them via PCA; an existing + PCADataset can be supplied to perform PCA transformation + + Args: + smiles (list[str]): SMILES strings + target_vals (Iterable[Iterable[float]]): target values of shape (n_samples, n_targets) + backend (str, optional): backend for QSPR generation, ['padel', 'alvadesc'] + existing_pca_dataset (PCADataset, optional): if PCA already trained (e.g. trained + using training set, want to use for testing set), the pre-trained PCA can be used + to perform PCA for this data + """ + + self.smiles = smiles + self.target_vals = torch.as_tensor(target_vals).type(torch.float32) + self.desc_names = None + desc_vals, _ = self.smi_to_qspr(smiles, backend) + if existing_pca_dataset is None: + self.pca = PCA(n_components=min(desc_vals.shape[0], desc_vals.shape[1])) + self.pca.fit(desc_vals) + else: + self.pca = existing_pca_dataset.pca + self.desc_vals = torch.as_tensor(self.pca.transform(desc_vals)).type(torch.float32) diff --git a/setup.py b/setup.py index f91321c..a8cdbca 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='ecnet', - version='4.1.0', + version='4.1.1', description='Fuel property prediction using QSPR descriptors', url='https://github.com/ecrl/ecnet', author='Travis Kessler', @@ -10,9 +10,9 @@ license='MIT', packages=find_packages(), install_requires=[ - 'torch==1.8.0', - 'sklearn', - 'padelpy==0.1.9', + 'torch==2.0.0', + 'scikit-learn==1.2.2', + 'padelpy==0.1.13', 'alvadescpy==0.1.2', 'ecabc==3.0.0' ],