diff --git a/experiments/sol_pipeline.py b/experiments/sol_pipeline.py new file mode 100644 index 000000000..dc9c78ad4 --- /dev/null +++ b/experiments/sol_pipeline.py @@ -0,0 +1,120 @@ +"""Create Solubility Models in AWS/SageWorks + +We're using Pipelines to create different set of ML Artifacts in SageWorks +""" + +import pandas as pd +import numpy as np +import logging + +from sageworks.api.data_source import DataSource +from sageworks.api.feature_set import FeatureSet +from sageworks.api.model import Model, ModelType +from sageworks.api.endpoint import Endpoint + +from sageworks.core.transforms.data_to_features.light.molecular_descriptors import MolecularDescriptors +from sageworks.aws_service_broker.aws_service_broker import AWSServiceBroker +from sageworks.api.pipeline import Pipeline +from sageworks.utils.pandas_utils import stratified_split + +log = logging.getLogger("sageworks") + +# Set our pipeline +pipeline_name = "test_solubility_class_nightly_100_v0" + + +if __name__ == "__main__": + # This forces a refresh on all the data we get from the AWs Broker + AWSServiceBroker().get_all_metadata(force_refresh=True) + + # Grab all the information from the Pipeline (as a dictionary) + pipe = Pipeline(pipeline_name).pipeline + + # Get all the pipeline information + s3_path = pipe["data_source"]["input"] + model_features = pipe["model"]["feature_list"] + data_source_input = pipe["data_source"]["input"] + data_source_name = pipe["data_source"]["name"] + data_source_tags = pipe["data_source"]["tags"] + feature_set_name = pipe["feature_set"]["name"] + feature_set_tags = pipe["feature_set"]["tags"] + holdout = pipe["feature_set"]["holdout"] + model_name = pipe["model"]["name"] + model_type_str = pipe["model"]["model_type"] + model_tags = pipe["model"]["tags"] + model_target = pipe["model"]["target_column"] + endpoint_name = pipe["endpoint"]["name"] + endpoint_tags = pipe["endpoint"]["tags"] + pipeline_id = pipe["pipeline"] + + # Recreate Flag in case you want to recreate the artifacts + recreate = False + + # Create the aqsol_data DataSource + if recreate or not DataSource(data_source_name).exists(): + # Grab the input and add some columns + df = DataSource(data_source_input).pull_dataframe() + + # Remove 'weird' values + log.important("Removing 'weird' values from the solubility data") + log.important(f"Original Shape: {df.shape}") + df = df[df["udm_asy_res_value"] != 4.7] + df = df[df["udm_asy_res_value"] != 0] + log.important(f"New Shape: {df.shape}") + + # Compute the log of the solubility + df["udm_asy_res_value"] = df["udm_asy_res_value"].replace(0, 1e-10) + df["log_s"] = np.log10(df["udm_asy_res_value"] / 1e6) + + # Create a solubility classification column + bins = [-float("inf"), -5, -4, float("inf")] + labels = ["low", "medium", "high"] + df["solubility_class"] = pd.cut(df["log_s"], bins=bins, labels=labels) + + # Now we'll create the DataSource with the new column + DataSource(df, name=data_source_name, tags=data_source_tags) + + # + # Molecular Descriptor Artifacts + # + # Create the rdkit FeatureSet (this is an example of using lower level classes) + if recreate or not FeatureSet(feature_set_name).exists(): + + rdkit_features = MolecularDescriptors(data_source_name, feature_set_name) + rdkit_features.set_output_tags(feature_set_tags) + rdkit_features.transform(id_column="udm_mol_id") + + # Set the holdout ids for the FeatureSet + fs = FeatureSet(feature_set_name) + + # Hold out logic (might be a list of ids or a stratified split) + if isinstance(holdout, list): + fs.set_holdout_ids("udm_mol_id", holdout) + else: + # Stratified Split, so we need to pull the parameters from the string + test_size = float(holdout.split(":")[1]) + column_name = holdout.split(":")[2] + df = fs.pull_dataframe()[["udm_mol_id", column_name]] + + # Perform the stratified split and set the hold out ids + train, test = stratified_split(df, column_name=column_name, test_size=test_size) + fs.set_holdout_ids("udm_mol_id", test["udm_mol_id"].tolist()) + + # Create the Model + model_type = ModelType(model_type_str) + if recreate or not Model(model_name).exists(): + feature_set = FeatureSet(feature_set_name) + feature_set.to_model( + model_type, + target_column=model_target, + name=model_name, + feature_list=model_features, + tags=model_tags, + ) + + # Create the Endpoint + if recreate or not Endpoint(endpoint_name).exists(): + m = Model(model_name) + m.to_endpoint(name=endpoint_name, tags=endpoint_tags) + end = Endpoint(endpoint_name) + end.auto_inference(capture=True) diff --git a/scripts/create_glue_workflow_with_trigger.py b/glue_jobs/create_glue_workflow_with_trigger.py similarity index 100% rename from scripts/create_glue_workflow_with_trigger.py rename to glue_jobs/create_glue_workflow_with_trigger.py diff --git a/glue_jobs/example_glue_job.py b/glue_jobs/example_glue_job.py new file mode 100644 index 000000000..f31e6a312 --- /dev/null +++ b/glue_jobs/example_glue_job.py @@ -0,0 +1,104 @@ +# Example Glue Job that goes from CSV to Model/Endpoint +import sys +import numpy as np +import pandas as pd + +# SageWorks Imports +from sageworks.api.data_source import DataSource +from sageworks.api.feature_set import FeatureSet +from sageworks.api.model import Model, ModelType +from sageworks.api.endpoint import Endpoint +from sageworks.core.transforms.data_to_features.light.molecular_descriptors import ( + MolecularDescriptors, +) +from sageworks.core.transforms.pandas_transforms.pandas_to_features import ( + PandasToFeatures, +) +from sageworks.utils.config_manager import ConfigManager +from sageworks.utils.glue_utils import glue_args_to_dict + +# Convert Glue Job Args to a Dictionary +glue_args = glue_args_to_dict(sys.argv) + +# Set the SAGEWORKS_BUCKET for the ConfigManager +cm = ConfigManager() +cm.set_config("SAGEWORKS_BUCKET", glue_args["--sageworks-bucket"]) +cm.set_config("REDIS_HOST", glue_args["--redis-host"]) + +# Create a new Data Source from an S3 Path +# source_path = "s3://idb-forest-sandbox/physchemproperty/LogS/Null/gen_processed/2024_03_07_id_smiles.csv" +source_path = ( + "s3://idb-forest-sandbox/physchemproperty/assay_processed_collection/solubility/all/2024_03_07_id_smiles.csv" +) +my_data = DataSource(source_path, name="solubility_test_data") + +# Pull the dataframe from the Data Source +df = DataSource("solubility_test_data").pull_dataframe() + +# Convert to logS +# Note: This will make 0 -> -16 +df["udm_asy_res_value"] = df["udm_asy_res_value"].replace(0, 1e-10) +df["log_s"] = np.log10(df["udm_asy_res_value"] / 1e6) +df["log_s"] = df["udm_asy_res_value"] + +# Create a solubility classification column +bins = [-float("inf"), -5, -4, float("inf")] +labels = ["low", "medium", "high"] +df["sol_class"] = pd.cut(df["log_s"], bins=bins, labels=labels) + +# Compute molecular descriptors +molecular_features = MolecularDescriptors("solubility_test_data", "solubility_test_features") + +# Okay we're going to use the guts of the class without actually doing the DS to FS transformation +molecular_features.input_df = df[:100] +molecular_features.transform_impl() +output_df = molecular_features.output_df +print(output_df.head()) + +# Create a Feature Set +to_features = PandasToFeatures("solubility_test_features", auto_one_hot=False) +to_features.set_input(output_df, target_column="log_s", id_column="udm_mol_bat_id") +to_features.set_output_tags(["test", "solubility"]) +to_features.transform() + + +""" +DataSource(source_path, name="solubility_test_data") + +# Create a Feature Set +molecular_features = MolecularDescriptors("solubility_test_data", "solubility_test_features") +molecular_features.set_output_tags(["test", "solubility", "molecular_descriptors"]) +query = "SELECT udm_mol_bat_id, udm_asy_protocol, udm_prj_code, udm_asy_res_value, smiles FROM solubility_test_data" +molecular_features.transform(target_column="solubility", id_column="udm_mol_bat_id", query=query, auto_one_hot=False) +""" + +""" +# Convert to logS +# Note: This will make 0 -> -16 +test_df["udm_asy_res_value"] = test_df["udm_asy_res_value"].replace(0, 1e-10) +test_df["log_s"] = np.log10(test_df["udm_asy_res_value"] / 1e6) + +target_column = "log_s" +meta = [ + "write_time", + "api_invocation_time", + "is_deleted", + "udm_asy_protocol", + "udm_asy_cnd_format", + "std_dev", + "count", + "udm_mol_id", + "udm_asy_date", + "udm_prj_code", + "udm_asy_cnd_target", + "udm_asy_cnd_time_hr", + "smiles", + "udm_mol_bat_slt_smiles", + "udm_mol_bat_slv_smiles", + "operator", + "class", + "event_time", +] +exclude = ["log_s", "udm_asy_res_value", "udm_mol_bat_id"] + meta +feature_columns = [c for c in test_df.columns if c not in exclude] +""" diff --git a/glue_jobs/glue_job_debug.py b/glue_jobs/glue_job_debug.py new file mode 100644 index 000000000..f4023e1b0 --- /dev/null +++ b/glue_jobs/glue_job_debug.py @@ -0,0 +1,269 @@ +import sys +import os +import awswrangler as wr + +# SageWorks Imports +from sageworks.api.feature_set import FeatureSet +from sageworks.api.model import Model, ModelType +from sageworks.api.endpoint import Endpoint +from sageworks.utils.config_manager import ConfigManager +from sageworks.utils.glue_utils import glue_args_to_dict + +# Convert Glue Job Args to a Dictionary +glue_args = glue_args_to_dict(sys.argv) + +# Set the SAGEWORKS_BUCKET for the ConfigManager +cm = ConfigManager() +cm.set_config("SAGEWORKS_BUCKET", glue_args["--sageworks-bucket"]) + +# Feature Set Name (Hardcoded) +fs_name = "solubility_featurized_fs" + +# Model Name (Hardcoded) +model_name = "solubility-class-0-nightly" + +# Target and Feature List (Hardcoded) +target = ["class"] +features = [ + "estate_vsa3", + "peoe_vsa3", + "bertzct", + "maxabspartialcharge", + "smr_vsa6", + "bcut2d_chghi", + "smr_vsa2", + "nhohcount", + "fr_ether", + "slogp_vsa4", + "vsa_estate7", + "numaromaticheterocycles", + "minabsestateindex", + "kappa1", + "fr_piperdine", + "maxestateindex", + "fpdensitymorgan3", + "fr_nh2", + "fr_oxazole", + "narombond", + "smr_vsa8", + "fr_sh", + "fr_nitro", + "fr_thiophene", + "bcut2d_mwhi", + "fr_ndealkylation1", + "estate_vsa11", + "labuteasa", + "peoe_vsa10", + "fpdensitymorgan1", + "fr_oxime", + "slogp_vsa6", + "minestateindex", + "kappa2", + "fr_al_oh", + "chi2n", + "qed", + "fr_c_o_nocoo", + "fr_phenol_noorthohbond", + "slogp_vsa5", + "fr_thiazole", + "apol", + "fr_priamide", + "vsa_estate6", + "smr_vsa9", + "slogp_vsa11", + "bcut2d_mrlow", + "fr_coo2", + "fr_nitro_arom", + "slogp_vsa7", + "bcut2d_mwlow", + "fr_alkyl_carbamate", + "bpol", + "fr_prisulfonamd", + "estate_vsa9", + "fr_thiocyan", + "fr_nitro_arom_nonortho", + "heavyatomcount", + "numaliphaticrings", + "vsa_estate10", + "fr_aniline", + "fr_quatn", + "smr_vsa10", + "fr_aldehyde", + "numhdonors", + "fr_hdrzine", + "estate_vsa4", + "fr_allylic_oxid", + "fr_furan", + "fr_term_acetylene", + "fr_phos_acid", + "bcut2d_chglo", + "fr_c_s", + "numheteroatoms", + "fr_ar_coo", + "fr_imine", + "fr_nh0", + "fr_piperzine", + "estate_vsa5", + "numsaturatedrings", + "fr_al_oh_notert", + "estate_vsa2", + "chi0", + "chi1n", + "peoe_vsa12", + "smr_vsa7", + "vsa_estate8", + "fr_phenol", + "slogp_vsa9", + "minpartialcharge", + "numsaturatedheterocycles", + "numaliphaticheterocycles", + "fr_methoxy", + "fr_aryl_methyl", + "rotratio", + "peoe_vsa6", + "peoe_vsa13", + "fr_para_hydroxylation", + "fr_hdrzone", + "fr_ar_nh", + "fr_benzene", + "smr_vsa1", + "estate_vsa6", + "numaromaticcarbocycles", + "fr_guanido", + "smr_vsa3", + "minabspartialcharge", + "smr_vsa5", + "fr_nh1", + "fr_imidazole", + "vsa_estate1", + "peoe_vsa4", + "fr_halogen", + "fr_epoxide", + "fr_morpholine", + "peoe_vsa5", + "fr_c_o", + "fr_ar_n", + "fr_n_o", + "fr_amidine", + "fr_barbitur", + "fr_hoccn", + "heavyatommolwt", + "exactmolwt", + "vsa_estate9", + "chi3n", + "fr_coo", + "fr_arn", + "fr_isocyan", + "fr_dihydropyridine", + "tpsa", + "fr_phos_ester", + "maxabsestateindex", + "fr_al_coo", + "nrot", + "slogp_vsa10", + "slogp_vsa8", + "fr_azo", + "fr_sulfone", + "numradicalelectrons", + "estate_vsa7", + "fr_ketone_topliss", + "smr_vsa4", + "fr_amide", + "molmr", + "molwt", + "bcut2d_logplow", + "bcut2d_mrhi", + "fractioncsp3", + "fr_bicyclic", + "vsa_estate3", + "bcut2d_logphi", + "fr_azide", + "fr_nitroso", + "numvalenceelectrons", + "slogp_vsa1", + "chi2v", + "fr_ketone", + "estate_vsa1", + "peoe_vsa2", + "fr_lactam", + "fr_nhpyrrole", + "fr_unbrch_alkane", + "numaromaticrings", + "slogp_vsa3", + "numaliphaticcarbocycles", + "vsa_estate4", + "fr_nitrile", + "peoe_vsa7", + "numrotatablebonds", + "nocount", + "fr_tetrazole", + "chi4v", + "fr_imide", + "fpdensitymorgan2", + "fr_sulfonamd", + "chi4n", + "kappa3", + "fr_sulfide", + "ringcount", + "nbase", + "balabanj", + "fr_diazo", + "peoe_vsa8", + "vsa_estate5", + "estate_vsa10", + "chi3v", + "estate_vsa8", + "vsa_estate2", + "chi0v", + "fr_benzodiazepine", + "fr_ester", + "slogp_vsa12", + "numhacceptors", + "fr_urea", + "chi1v", + "maxpartialcharge", + "slogp_vsa2", + "fr_lactone", + "fr_pyridine", + "peoe_vsa14", + "peoe_vsa9", + "mollogp", + "fr_ar_oh", + "peoe_vsa1", + "numsaturatedcarbocycles", + "nacid", + "fr_ndealkylation2", + "chi0n", + "naromatom", + "fr_alkyl_halide", + "chi1", + "fr_isothiocyan", + "hallkieralpha", + "peoe_vsa11", +] + + +# Get FeatureSet +full_fs = FeatureSet(fs_name) + +# placeholder for now will be using chem-utils helper function to verify tags +assay_type = "solubility" +tags_list = ["Nightly", f"assay:{assay_type}"] + +################# CREATE MODEL, DEPLOY ENDPOINT, INFERENCE ################# +sw_model = full_fs.to_model( + model_type=ModelType.CLASSIFIER, + target_column=target[0], + feature_list=features, + name=model_name, + tags=tags_list, +) +print("Model Creation Complete") + +sw_endpoint = sw_model.to_endpoint(name=model_name, tags="nightly", serverless=True) +print(f"Endpoint Deployment Complete") +sw_endpoint = Endpoint(model_name) + +# Now run inference on the endpoint +results_df = sw_endpoint.auto_inference(capture=True) +print(f"Endpoint Holdout Inference Ran") diff --git a/glue_jobs/glue_job_pipeline.py b/glue_jobs/glue_job_pipeline.py new file mode 100644 index 000000000..81c4705a8 --- /dev/null +++ b/glue_jobs/glue_job_pipeline.py @@ -0,0 +1,107 @@ +"""Create Solubility Models in AWS/SageWorks + +We're using Pipelines to create different set of ML Artifacts in SageWorks +""" + +import sys +import pandas as pd +import numpy as np +import logging + +from sageworks.api import DataSource, FeatureSet, Model, Endpoint +from sageworks.api.model import ModelType +from sageworks.core.transforms.data_to_features.light.molecular_descriptors import MolecularDescriptors +from sageworks.api.pipeline import Pipeline + +from sageworks.utils.config_manager import ConfigManager +from sageworks.utils.glue_utils import glue_args_to_dict + +# Convert Glue Job Args to a Dictionary +glue_args = glue_args_to_dict(sys.argv) + +# Set the SageWorks Config (needs to be done early) +cm = ConfigManager() +cm.set_config("SAGEWORKS_BUCKET", glue_args["--sageworks-bucket"]) +cm.set_config("REDIS_HOST", glue_args["--redis-host"]) +log = logging.getLogger("sageworks") + +# Set our pipeline +pipeline_name = "test_solubility_class_nightly_100_v0" + + +# A bit of specific processing (maybe put in utils or something) +def solubility_processing(df): + # Remove 'weird' solubility values + log.important("Removing 'weird' values from the solubility data") + log.important(f"Original Shape: {df.shape}") + df = df[df["udm_asy_res_value"] != 4.7] + df = df[df["udm_asy_res_value"] != 0] + log.important(f"New Shape: {df.shape}") + + # Compute the log of the solubility + df["udm_asy_res_value"] = df["udm_asy_res_value"].replace(0, 1e-10) + df["log_s"] = np.log10(df["udm_asy_res_value"] / 1e6) + + # Create a solubility classification column + bins = [-float("inf"), -5, -4, float("inf")] + labels = ["low", "medium", "high"] + df["solubility_class"] = pd.cut(df["log_s"], bins=bins, labels=labels) + return df + + +if __name__ == "__main__": + + # Grab all the information from the Pipeline (as a dictionary) + pipe = Pipeline(pipeline_name).pipeline + + # Get all the pipeline information + id_column = pipe["data_source"]["id_column"] + model_features = pipe["model"]["feature_list"] + data_source_input = pipe["data_source"]["input"] + data_source_name = pipe["data_source"]["name"] + data_source_tags = pipe["data_source"]["tags"] + feature_set_name = pipe["feature_set"]["name"] + feature_set_tags = pipe["feature_set"]["tags"] + holdout = pipe["feature_set"]["holdout"] + model_name = pipe["model"]["name"] + model_type_str = pipe["model"]["model_type"] + model_tags = pipe["model"]["tags"] + model_target = pipe["model"]["target_column"] + endpoint_name = pipe["endpoint"]["name"] + endpoint_tags = pipe["endpoint"]["tags"] + pipeline_name = pipe["pipeline"] + + # Grab the data from the input DataSource + df = DataSource(data_source_input).pull_dataframe() + + # A bit of specific processing + df = solubility_processing(df) + + # Now we'll create the DataSource with the new column + DataSource(df, name=data_source_name, tags=data_source_tags) + + # Molecular Descriptor Artifacts + rdkit_features = MolecularDescriptors(data_source_name, feature_set_name) + rdkit_features.set_output_tags(feature_set_tags) + rdkit_features.transform(id_column=id_column) + + # Set the holdout ids for the FeatureSet (not needed for this example) + fs = FeatureSet(feature_set_name) + + # Create the Model + model_type = ModelType(model_type_str) + feature_set = FeatureSet(feature_set_name) + feature_set.to_model( + model_type, + target_column=model_target, + name=model_name, + feature_list=model_features, + tags=model_tags, + ) + + # Create the Endpoint + m = Model(model_name) + m.set_pipeline(pipeline_name) + m.to_endpoint(name=endpoint_name, tags=endpoint_tags) + end = Endpoint(endpoint_name) + end.auto_inference(capture=True) diff --git a/scripts/glue_mixed_case.py b/glue_jobs/glue_mixed_case.py similarity index 100% rename from scripts/glue_mixed_case.py rename to glue_jobs/glue_mixed_case.py diff --git a/scripts/sol_feature_resolution.py b/scripts/sol_feature_resolution.py new file mode 100644 index 000000000..73b4a85eb --- /dev/null +++ b/scripts/sol_feature_resolution.py @@ -0,0 +1,69 @@ +"""Example Script for the FeatureResolution Class""" + +import numpy as np +import pandas as pd + +from sageworks.api.feature_set import FeatureSet +from sageworks.algorithms.dataframe.feature_resolution import FeatureResolution + +# Grab a test dataframe +fs = FeatureSet("solubility_featurized_ds") +test_df = fs.pull_dataframe() + +# Convert to logS +# Note: This will make 0 -> -16 +test_df["udm_asy_res_value"] = test_df["udm_asy_res_value"].replace(0, 1e-10) +test_df["log_s"] = np.log10(test_df["udm_asy_res_value"] / 1e6) + +target_column = "log_s" +meta = [ + "write_time", + "api_invocation_time", + "is_deleted", + "udm_asy_protocol", + "udm_asy_cnd_format", + "std_dev", + "count", + "udm_mol_id", + "udm_asy_date", + "udm_prj_code", + "udm_asy_cnd_target", + "udm_asy_cnd_time_hr", + "smiles", + "udm_mol_bat_slt_smiles", + "udm_mol_bat_slv_smiles", + "operator", + "class", + "event_time", +] +exclude = ["log_s", "udm_asy_res_value", "udm_mol_bat_id"] + meta +feature_columns = [c for c in test_df.columns if c not in exclude] +print(f"Num Features {len(feature_columns)}") + +output_columns = [ + "smiles", + "udm_asy_protocol", + "udm_asy_cnd_target", + "udm_asy_cnd_format", + "udm_asy_date", + "udm_prj_code", + "udm_mol_bat_slt_ratio", + "udm_mol_bat_slt_smiles", + "udm_mol_bat_slv_ratio", + "udm_mol_bat_slv_smiles", + "udm_asy_cnd_time_hr", +] + +# Create the class and run the report +resolution = FeatureResolution( + test_df, + features=feature_columns, + target_column=target_column, + id_column="udm_mol_bat_id", +) +output_df = resolution.compute(within_distance=0.01, min_target_difference=2.0, output_columns=output_columns) + +# Print the output +pd.options.display.max_columns = None +pd.options.display.width = 1000 +print(output_df.head())