-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
e038144
commit ee9abbe
Showing
5 changed files
with
1,338 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,271 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "5aa74260", | ||
"metadata": {}, | ||
"source": [ | ||
"# Feature Resolution and Residuals Experiment (WIP)\n", | ||
"\n", | ||
"This notebook uses the SageWorks Framework to quickly build an AWS® Machine Learning Pipeline with the AQSolDB public dataset. For this exercise we're going to look at the relationship between feature space and target values, specifically we're going to use SageWorks to help us identify areas where compounds that are close in feature space have significant differences in their target values (solubility in this case).\n", | ||
"\n", | ||
"\n", | ||
"## Data\n", | ||
"AqSolDB: A curated reference set of aqueous solubility, created by the Autonomous Energy Materials Discovery [AMD] research group, consists of aqueous solubility values of 9,982 unique compounds curated from 9 different publicly available aqueous solubility datasets. AqSolDB also contains some relevant topological and physico-chemical 2D descriptors. Additionally, AqSolDB contains validated molecular representations of each of the compounds. This openly accessible dataset, which is the largest of its kind, and will not only serve as a useful reference source of measured and calculated solubility data, but also as a much improved and generalizable training data source for building data-driven models. (2019-04-10)\n", | ||
"\n", | ||
"Main Reference:\n", | ||
"https://www.nature.com/articles/s41597-019-0151-1\n", | ||
"\n", | ||
"Data Dowloaded from the Harvard DataVerse:\n", | ||
"https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OVHAW8\n", | ||
"\n", | ||
"® Amazon Web Services, AWS, the Powered by AWS logo, are trademarks of Amazon.com, Inc. or its affiliates." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "77f0186f-c6ac-4dc7-a804-ce5d248d25b7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import sageworks\n", | ||
"import logging\n", | ||
"logging.getLogger(\"sageworks\").setLevel(logging.WARNING)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a7ae1c21", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# We've already created a FeatureSet so just grab it a sample\n", | ||
"from sageworks.api import FeatureSet\n", | ||
"fs = FeatureSet(\"test_sol_nightly_log_s\")\n", | ||
"full_df = fs.pull_dataframe()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "97243583", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"full_df.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "bb268a27-6f18-432e-8eb5-43316f6174cf", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Sanity check our solubility and solubility_class\n", | ||
"import pandas as pd\n", | ||
"import seaborn as sns\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"plt.rcParams['font.size'] = 12.0\n", | ||
"plt.rcParams['figure.figsize'] = 14.0, 5.0\n", | ||
"sns.set_theme(style='darkgrid')\n", | ||
"\n", | ||
"# Create a box plot\n", | ||
"ax = sns.boxplot(x='class', y='log_s', data=full_df, order = [2, 1, 0])\n", | ||
"plt.title('Solubility by Solubility Class')\n", | ||
"plt.xlabel('Solubility Class')\n", | ||
"plt.ylabel('Solubility')\n", | ||
"plt.show()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "fb742921-cfc1-4722-af84-a86ac0f86d5f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"Chem.MolFromSmiles(\"CCC(=O)OC(CC1=CC=CC=C1)(C(C)CN(C)C)C2=CC=CC=C2\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "fdb44d45-57a6-4461-8a84-5d30ae6a9c9f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"Chem.MolFromSmiles(\"CCC(=O)O[C@@](Cc1ccccc1)([C@H](C)CN(C)C)c2ccccc2\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "96a71382-69c4-4804-9c4f-912acaeea4d9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def show(id):\n", | ||
" smile = df[df[\"id\"]==id][\"smiles\"].values[0]\n", | ||
" print(smile)\n", | ||
" _features = df[df[\"id\"]==id][features].values[0]\n", | ||
" print(_features)\n", | ||
" _target = df[df[\"id\"]==id][target].values[0]\n", | ||
" print(_target)\n", | ||
" return Chem.MolFromSmiles(smile)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a8949d09-672e-4cc9-a05e-f60e242cd122", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"close_ids = [\"E-1200\", \"A-3473\", \"B-1665\"]\n", | ||
"show(\"E-1200\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ee463c10-d1e8-4954-aebf-7e14000676c0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"show(\"A-3473\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "7aedb2fa-924a-42ad-9970-f0d53b2dd250", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"show(\"B-1665\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a4ed1d10-8960-4003-99b9-c594bdb41500", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from rdkit import Chem\n", | ||
"\n", | ||
"# Create an RDKit molecule from a SMILES string\n", | ||
"smiles = \"CCC(=O)OC(CC1=CC=CC=C1)(C(C)CN(C)C)C2=CC=CC=C2\"\n", | ||
"mol = Chem.MolFromSmiles(smiles)\n", | ||
"\n", | ||
"# Assign stereochemistry using RDKit\n", | ||
"Chem.AssignStereochemistry(mol, cleanIt=True, force=True)\n", | ||
"\n", | ||
"# Find chiral centers and their configurations\n", | ||
"chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)\n", | ||
"\n", | ||
"# Print the results\n", | ||
"for center in chiral_centers:\n", | ||
" index, configuration = center\n", | ||
" print(f\"Atom index: {index}, Configuration: {configuration}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d804d78c-0800-433b-8c69-40a799f463dc", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create an RDKit molecule from a SMILES string\n", | ||
"smiles = \"CCC(=O)O[C@@](Cc1ccccc1)([C@H](C)CN(C)C)c2ccccc2\"\n", | ||
"mol = Chem.MolFromSmiles(smiles)\n", | ||
"\n", | ||
"# Assign stereochemistry using RDKit\n", | ||
"Chem.AssignStereochemistry(mol, cleanIt=True, force=True)\n", | ||
"\n", | ||
"# Find chiral centers and their configurations\n", | ||
"chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)\n", | ||
"\n", | ||
"# Print the results\n", | ||
"for center in chiral_centers:\n", | ||
" index, configuration = center\n", | ||
" print(f\"Atom index: {index}, Configuration: {configuration}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "f31162c1", | ||
"metadata": {}, | ||
"source": [ | ||
"# Helper Methods" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "90d26e96", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Helper to look at predictions vs target\n", | ||
"from math import sqrt\n", | ||
"import pandas as pd\n", | ||
"import seaborn as sns\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"plt.rcParams['font.size'] = 12.0\n", | ||
"plt.rcParams['figure.figsize'] = 14.0, 5.0\n", | ||
"sns.set_theme(style='darkgrid')\n", | ||
"def plot_predictions(df, line=True, color=\"PredError\"):\n", | ||
" \n", | ||
" # Dataframe of the targets and predictions\n", | ||
" target = 'Actual Solubility'\n", | ||
" pred = 'Predicted Solubility'\n", | ||
" df_plot = pd.DataFrame({target: df['log_s'], pred: df['prediction']})\n", | ||
" \n", | ||
" # Compute Error per prediction\n", | ||
" if color == \"PredError\":\n", | ||
" df_plot[\"PredError\"] = df_plot.apply(lambda x: abs(x[pred] - x[target]), axis=1)\n", | ||
" else:\n", | ||
" df_plot[color] = df[color]\n", | ||
"\n", | ||
" #df_plot['error'] = df_plot.apply(lambda x: abs(x[pred] - x[target]), axis=1)\n", | ||
" ax = df_plot.plot.scatter(x=target, y=pred, c=color, cmap='coolwarm', sharex=False)\n", | ||
" \n", | ||
" # Just a diagonal line\n", | ||
" if line:\n", | ||
" ax.axline((1, 1), slope=1, linewidth=2, c='black')\n", | ||
" x_pad = (df_plot[target].max() - df_plot[target].min())/10.0 \n", | ||
" y_pad = (df_plot[pred].max() - df_plot[pred].min())/10.0\n", | ||
" plt.xlim(df_plot[target].min()-x_pad, df_plot[target].max()+x_pad)\n", | ||
" plt.ylim(df_plot[pred].min()-y_pad, df_plot[pred].max()+y_pad)\n", | ||
"\n", | ||
" " | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.