forked from martin-sicho/papyrus-scaffold-visualizer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
83 lines (70 loc) · 2.85 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
utils

Helper functions that prepare the example Papyrus data set and build the example QSPRPred models.

Created by: Martin Sicho
On: 22.05.23, 10:12
"""
import os
from qsprpred.data.data import QSPRDataset
from qsprpred.data.sources.papyrus import Papyrus
from qsprpred.data.utils.datasplitters import scaffoldsplit
from qsprpred.data.utils.descriptorcalculator import MoleculeDescriptorsCalculator
from qsprpred.data.utils.descriptorsets import FingerprintSet
from qsprpred.data.utils.featurefilters import lowVarianceFilter, highCorrelationFilter
from qsprpred.data.utils.scaffolds import Murcko
from qsprpred.models.models import QSPRsklearn
def prepare_example_dataset(mol_table, target_props, force_build=False):
    """
    Create a QSPRPred data set from a molecule table and prepare it for modelling.

    When preparation runs, the data set gets Morgan fingerprint descriptors
    (radius 3, 2048 bits), is split on Murcko scaffolds with a test fraction of 0.2,
    is filtered with `lowVarianceFilter(0.05)` and `highCorrelationFilter(0.8)`,
    and is then saved.

    Args:
        mol_table: molecule table passed to `QSPRDataset.fromMolTable`
        target_props: target properties to use as specified in the QSPRPred package
        force_build: if True, the data set is prepared even if it already has descriptors

    Returns:
        the `QSPRDataset` created from `mol_table`
    """
    dataset = QSPRDataset.fromMolTable(mol_table, target_props=target_props)

    # Prepare when a rebuild is requested or when no descriptors exist yet;
    # `force_build` must trigger the preparation, never suppress it.
    if force_build or not dataset.hasDescriptors:
        feature_calculator = MoleculeDescriptorsCalculator(
            descsets=[FingerprintSet(fingerprint_type="MorganFP", radius=3, nBits=2048)]
        )
        # split on Murcko scaffolds
        split = scaffoldsplit(dataset=dataset, scaffold=Murcko(), test_fraction=0.2)
        lv = lowVarianceFilter(0.05)
        hc = highCorrelationFilter(0.8)
        dataset.prepareDataset(
            split=split,
            feature_calculators=[feature_calculator],
            feature_filters=[lv, hc]
        )
        dataset.save()
    else:
        print("Data set already prepared. Preparation skipped.")

    # report the sizes of the training and test sets
    print(f"Number of samples train set: {len(dataset.y)}")
    print(f"Number of samples test set: {len(dataset.y_ind)}, {len(dataset.y_ind) / len(dataset.df) * 100}%")

    return dataset
def fetch_example_models(models, target_props, force_build=False):
    """
    Use the example data set to build example models if they do not exist. Reload old models otherwise.

    `models` and `target_props` are paired by position with `zip`, so iteration
    stops at the end of the shorter of the two.

    Args:
        models: classes of scikit-learn models to use
        target_props: target properties to use as specified in the QSPRPred package,
            one entry per model (each entry is handed to `prepare_example_dataset`)
        force_build: if True, the models will be built even if they already exist

    Returns:
        list of fitted and evaluated models
    """
    # use Papyrus to fetch the data set
    acc_keys = ["P51681"]
    name = "P51681_LIGANDS_nostereo"
    quality = "low"
    papyrus = Papyrus(data_dir="./data", stereo=False)
    mol_table = papyrus.getData(
        acc_keys,
        quality,
        name=name,
        use_existing=True  # use existing data set if it was already compiled before
    )

    # train the models
    fitted_models = []
    for alg, prop in zip(models, target_props):
        # Always start from the fetched molecule table so that each model gets
        # its own data set instead of one derived from the previous iteration.
        dataset = prepare_example_dataset(mol_table, prop, force_build=force_build)
        model = QSPRsklearn(
            base_dir='data',
            data=dataset,
            alg=alg,
            name=alg.__name__
        )

        # only train if required
        if force_build or not os.path.exists(model.metaFile):
            model.evaluate()
            model.fit()

        fitted_models.append(model)

    return fitted_models