diff --git a/src/MEDS_tabular_automl/base_model.py b/src/MEDS_tabular_automl/base_model.py
new file mode 100644
index 0000000..9f30a07
--- /dev/null
+++ b/src/MEDS_tabular_automl/base_model.py
@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+from mixins import TimeableMixin
+
+
+class BaseModel(ABC, TimeableMixin):
+    @abstractmethod
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def train(self):
+        pass
+
+    @abstractmethod
+    def evaluate(self) -> float:
+        pass
+
+    @abstractmethod
+    def save_model(self, output_fp: Path):
+        pass
diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml
new file mode 100644
index 0000000..71fcc14
--- /dev/null
+++ b/src/MEDS_tabular_automl/configs/launch_model.yaml
@@ -0,0 +1,22 @@
+defaults:
+  - default
+  - tabularization: default
+  - model: xgboost # This can be changed to sgd_classifier or any other model
+  - override hydra/sweeper: optuna
+  - override hydra/sweeper/sampler: tpe
+  - override hydra/launcher: joblib
+  - _self_
+
+task_name: task
+
+# Task cached data dir
+input_dir: ${output_cohort_dir}/${task_name}/task_cache
+# Directory with task labels
+input_label_dir: ${output_cohort_dir}/${task_name}/labels/
+# Where to output the model and cached data
+model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S}
+output_filepath: ${model_dir}/model_metadata.json
+
+log_dir: ${model_dir}/.logs/
+
+name: launch_model
\ No newline at end of file
diff --git a/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml
new file mode 100644
index 0000000..1b05f15
--- /dev/null
+++ b/src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml
@@ -0,0 +1,19 @@
+model_params:
+  epochs: 20
+  early_stopping_rounds: 5
+  model:
+    type: sklearn
+    _target_: sklearn.linear_model.SGDClassifier
+    loss: log_loss
+  iterator:
+    keep_data_in_memory: True
+    binarize_task: True
+
+hydra:
+  sweeper:
+    params:
+      +model_params.model.alpha: tag(log, interval(1e-6, 1))
+      +model_params.model.l1_ratio: interval(0, 1)
+      +model_params.model.penalty: choice(['l1', 'l2', 'elasticnet'])
+      model_params.epochs: range(10, 100)
+      model_params.early_stopping_rounds: range(1, 10)
\ No newline at end of file
diff --git a/src/MEDS_tabular_automl/configs/models/xgboost.yaml b/src/MEDS_tabular_automl/configs/models/xgboost.yaml
new file mode 100644
index 0000000..a4be06e
--- /dev/null
+++ b/src/MEDS_tabular_automl/configs/models/xgboost.yaml
@@ -0,0 +1,27 @@
+model_params:
+  num_boost_round: 1000
+  early_stopping_rounds: 5
+  model:
+    type: xgboost
+    # _target_: xgboost.XGBClassifier
+    booster: gbtree
+    device: cpu
+    nthread: 1
+    tree_method: hist
+    objective: binary:logistic
+  iterator:
+    keep_data_in_memory: True
+    binarize_task: True
+
+hydra:
+  sweeper:
+    params:
+      +model_params.model.eta: tag(log, interval(0.001, 1))
+      +model_params.model.lambda: tag(log, interval(0.001, 1))
+      +model_params.model.alpha: tag(log, interval(0.001, 1))
+      +model_params.model.subsample: interval(0.5, 1)
+      +model_params.model.min_child_weight: interval(1e-2, 100)
+      model_params.num_boost_round: range(100, 1000)
+      model_params.early_stopping_rounds: range(1, 10)
+      +model_params.model.max_depth: range(2, 16)
+      tabularization.min_code_inclusion_count: tag(log, range(10, 1000000))
\ No newline at end of file
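For context on the model configs above: the _target_ key in sgd_classifier.yaml is the standard Hydra instantiation hook, while the sibling type key is only a dispatch discriminator and is not an SGDClassifier constructor kwarg. A minimal sketch of how such a node becomes a live estimator, assuming the launcher strips the type key before instantiation:

from hydra.utils import instantiate
from omegaconf import OmegaConf

# The model node from sgd_classifier.yaml, minus the non-kwarg "type" key
# (assumed to be dropped by the launcher before this point).
node = OmegaConf.create({"_target_": "sklearn.linear_model.SGDClassifier", "loss": "log_loss"})

# instantiate() imports the _target_ class and passes the remaining keys as kwargs.
estimator = instantiate(node)
print(type(estimator).__name__)  # SGDClassifier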
diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml
index 5166b91..a4ffdc6 100644
--- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml
+++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml
@@ -2,7 +2,7 @@
 filtered_code_metadata_fp: ${output_cohort_dir}/tabularized_code_metadata.parquet
 allowed_codes: null
 min_code_inclusion_count: 10
-min_code_inclusion_frequency: 0.01
+min_code_inclusion_frequency: null
 max_included_codes: null
 window_sizes:
   - "1d"
@@ -21,4 +21,4 @@ aggs:
   - "value/max"

 # Resolved inputs
-_resolved_codes: ${filter_to_codes:${tabularization.filtered_code_metadata_fp},${tabularization.allowed_codes},${tabularization.min_code_inclusion_count},$`{tabularization.min_code_inclusion_frequency},${tabularization.max_included_codes}}`}
+_resolved_codes: ${filter_to_codes:${tabularization.filtered_code_metadata_fp},${tabularization.allowed_codes},${tabularization.min_code_inclusion_count},${tabularization.min_code_inclusion_frequency},${tabularization.max_included_codes}}
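One note on the _resolved_codes line above: ${filter_to_codes:...} is a custom OmegaConf resolver, not built-in Hydra syntax, so it must be registered before the config is resolved. A minimal sketch of that wiring; the parameter names here are assumptions inferred from the call site, and the body is a placeholder for the real implementation in this package:

from omegaconf import OmegaConf

def filter_to_codes(metadata_fp, allowed_codes, min_count, min_frequency, max_codes):
    # Placeholder body: the real resolver reads the code metadata parquet and
    # returns the sorted list of codes passing all of the inclusion filters.
    return []

# Must run at import time, before any config interpolation is resolved.
OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes, replace=True)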
+ """ + TabularDataset.__init__(self, cfg=cfg, split=split) + TimeableMixin.__init__(self) + self.valid_event_ids, self.labels = self._load_ids_and_labels() + # check if the labels are empty + if len(self.labels) == 0: + raise ValueError("No labels found.") + # self._it = 0 + + def densify(self) -> np.ndarray: + """Builds the data as a dense matrix based on column subselection.""" + + # get the column indices to include + cols = self.get_feature_indices() + + # map those to the feature names in the data + feature_names = self.get_all_column_names() + selected_features = [feature_names[col] for col in cols] + + # get the dense matrix by iterating through the data shards + data = [] + labels = [] + for shard_idx in range(len(self._data_shards)): + shard_data, shard_labels = self.get_data_shards(shard_idx) + shard_data = shard_data[:, cols] + data.append(shard_data) + labels.append(shard_labels) + data = sp.vstack(data) + labels = np.concatenate(labels, axis=0) + return data, labels, selected_features + + diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py new file mode 100644 index 0000000..ac11c3c --- /dev/null +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -0,0 +1,60 @@ +from importlib.resources import files +from pathlib import Path + +import hydra +import pandas as pd +from loguru import logger +from omegaconf import DictConfig + +from MEDS_tabular_automl.dense_iterator import DenseIterator + +from ..utils import hydra_loguru_init + +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") +if not config_yaml.is_file(): + raise FileNotFoundError("Core configuration not successfully installed!") + + +@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) +def main(cfg: DictConfig) -> float: + """Launches AutoGluon after collecting data based on the provided configuration. + + Args: + cfg: The configuration dictionary specifying model and training parameters. + + """ + + # print(OmegaConf.to_yaml(cfg)) + if not cfg.loguru_init: + hydra_loguru_init() + + # check that autogluon is installed + try: + import autogluon as ag + except ImportError: + logger.error("AutoGluon is not installed. 
diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py
new file mode 100644
index 0000000..ac11c3c
--- /dev/null
+++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -0,0 +1,58 @@
+from importlib.resources import files
+
+import hydra
+import pandas as pd
+from loguru import logger
+from omegaconf import DictConfig
+
+from MEDS_tabular_automl.dense_iterator import DenseIterator
+
+from ..utils import hydra_loguru_init
+
+config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml")
+if not config_yaml.is_file():
+    raise FileNotFoundError("Core configuration not successfully installed!")
+
+
+@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem)
+def main(cfg: DictConfig) -> None:
+    """Launches AutoGluon after collecting data based on the provided configuration.
+
+    Args:
+        cfg: The configuration dictionary specifying model and training parameters.
+    """
+    # print(OmegaConf.to_yaml(cfg))
+    if not cfg.loguru_init:
+        hydra_loguru_init()
+
+    # check that autogluon is installed
+    try:
+        from autogluon.tabular import TabularPredictor
+    except ImportError as e:
+        logger.error("AutoGluon is not installed. Please install AutoGluon.")
+        raise e
+
+    # collect data based on the configuration
+    itrain = DenseIterator(cfg, "train")
+    ituning = DenseIterator(cfg, "tuning")
+    iheld_out = DenseIterator(cfg, "held_out")
+
+    # collect data for AutoGluon
+    train_data, train_labels, cols = itrain.densify()
+    tuning_data, tuning_labels, _ = ituning.densify()
+    held_out_data, held_out_labels, _ = iheld_out.densify()
+
+    # construct dfs for AutoGluon
+    train_df = pd.DataFrame(train_data.todense(), columns=cols)
+    train_df[cfg.task_name] = train_labels
+    tuning_df = pd.DataFrame(tuning_data.todense(), columns=cols)
+    tuning_df[cfg.task_name] = tuning_labels
+    held_out_df = pd.DataFrame(held_out_data.todense(), columns=cols)
+    held_out_df[cfg.task_name] = held_out_labels
+
+    # launch AutoGluon
+    predictor = TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df)
+
+
+if __name__ == "__main__":
+    main()
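The held-out frame built above is constructed but not yet consumed. A hypothetical follow-up using AutoGluon's documented predictor API (not part of this diff) would score and rank the fitted models:

# Hypothetical follow-up, assuming the predictor and held_out_df from above.
performance = predictor.evaluate(held_out_df)      # metrics on the held-out split
leaderboard = predictor.leaderboard(held_out_df)   # per-model ranking
print(performance)
print(leaderboard)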
+ """ + + # print(OmegaConf.to_yaml(cfg)) + if not cfg.loguru_init: + hydra_loguru_init() + try: + model_type = cfg.model.type + ModelClass = MODEL_CLASSES.get(model_type) + if ModelClass is None: + raise ValueError(f"Model type {model_type} not supported.") + + model = ModelClass(cfg) + model.train() + auc = model.evaluate() + logger.info(f"AUC: {auc}") + + # save model + output_fp = Path(cfg.output_filepath) + output_fp.parent.mkdir(parents=True, exist_ok=True) + + model.save_model(output_fp) + except Exception as e: + logger.error(f"Error occurred: {e}") + auc = 0.0 + return auc + + +if __name__ == "__main__": + main() diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index fd09e70..22d10c3 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -27,30 +27,30 @@ def main(cfg: DictConfig) -> float: # print(OmegaConf.to_yaml(cfg)) if not cfg.loguru_init: hydra_loguru_init() - try: - model = XGBoostModel(cfg) - model.train() - auc = model.evaluate() - logger.info(f"AUC: {auc}") - - # print( - # "Time Profiling for window sizes ", - # f"{cfg.tabularization.window_sizes} and min ", - # f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", - # ) - # print("Train Time: \n", model._profile_durations()) - # print("Train Iterator Time: \n", model.itrain._profile_durations()) - # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) - # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) - - # save model - output_fp = Path(cfg.output_filepath) - output_fp.parent.mkdir(parents=True, exist_ok=True) - - model.save_model(output_fp) - except Exception as e: - logger.error(f"Error occurred: {e}") - auc = 0.0 + # try: + model = XGBoostModel(cfg) + model.train() + auc = model.evaluate() + logger.info(f"AUC: {auc}") + + # print( + # "Time Profiling for window sizes ", + # f"{cfg.tabularization.window_sizes} and min ", + # f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:", + # ) + # print("Train Time: \n", model._profile_durations()) + # print("Train Iterator Time: \n", model.itrain._profile_durations()) + # print("Tuning Iterator Time: \n", model.ituning._profile_durations()) + # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations()) + + # save model + output_fp = Path(cfg.output_filepath) + output_fp.parent.mkdir(parents=True, exist_ok=True) + + model.save_model(output_fp) + # except Exception as e: + # logger.error(f"Error occurred: {e}") + # auc = 0.0 return auc diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index e1aa0ee..34d9c0d 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -86,7 +86,6 @@ def main( hydra_loguru_init() # Step 1: Cache the filtered features that will be used in the tabularization process and modeling - # import pdb; pdb.set_trace() def read_fn(_): return _ diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index b660123..e8c9f6a 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -9,6 +9,7 @@ from sklearn.metrics import roc_auc_score from .tabular_dataset import TabularDataset +from .base_model import BaseModel class SklearnIterator(TabularDataset, TimeableMixin): @@ -77,7 +78,7 @@ def get_label(self): return 
diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py
index fd09e70..22d10c3 100644
--- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py
+++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py
@@ -27,30 +27,30 @@ def main(cfg: DictConfig) -> float:
     # print(OmegaConf.to_yaml(cfg))
     if not cfg.loguru_init:
         hydra_loguru_init()
-    try:
-        model = XGBoostModel(cfg)
-        model.train()
-        auc = model.evaluate()
-        logger.info(f"AUC: {auc}")
-
-        # print(
-        #     "Time Profiling for window sizes ",
-        #     f"{cfg.tabularization.window_sizes} and min ",
-        #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
-        # )
-        # print("Train Time: \n", model._profile_durations())
-        # print("Train Iterator Time: \n", model.itrain._profile_durations())
-        # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
-        # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
-
-        # save model
-        output_fp = Path(cfg.output_filepath)
-        output_fp.parent.mkdir(parents=True, exist_ok=True)
-
-        model.save_model(output_fp)
-    except Exception as e:
-        logger.error(f"Error occurred: {e}")
-        auc = 0.0
+    # try:
+    model = XGBoostModel(cfg)
+    model.train()
+    auc = model.evaluate()
+    logger.info(f"AUC: {auc}")
+
+    # print(
+    #     "Time Profiling for window sizes ",
+    #     f"{cfg.tabularization.window_sizes} and min ",
+    #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
+    # )
+    # print("Train Time: \n", model._profile_durations())
+    # print("Train Iterator Time: \n", model.itrain._profile_durations())
+    # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
+    # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
+
+    # save model
+    output_fp = Path(cfg.output_filepath)
+    output_fp.parent.mkdir(parents=True, exist_ok=True)
+
+    model.save_model(output_fp)
+    # except Exception as e:
+    #     logger.error(f"Error occurred: {e}")
+    #     auc = 0.0
     return auc
diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py
index e1aa0ee..34d9c0d 100644
--- a/src/MEDS_tabular_automl/scripts/tabularize_static.py
+++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py
@@ -86,7 +86,6 @@ def main(
         hydra_loguru_init()

     # Step 1: Cache the filtered features that will be used in the tabularization process and modeling
-    # import pdb; pdb.set_trace()

     def read_fn(_):
         return _
diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py
index b660123..e8c9f6a 100644
--- a/src/MEDS_tabular_automl/sklearn_model.py
+++ b/src/MEDS_tabular_automl/sklearn_model.py
@@ -9,6 +9,7 @@ from sklearn.metrics import roc_auc_score

 from .tabular_dataset import TabularDataset
+from .base_model import BaseModel


 class SklearnIterator(TabularDataset, TimeableMixin):
@@ -77,7 +78,7 @@ def get_label(self):
         return self.labels


-class SklearnModel(TimeableMixin):
+class SklearnModel(BaseModel, TimeableMixin):
     """Class for configuring, training, and evaluating a SciKit-Learn model.

     This class utilizes the configuration settings provided to manage the training and evaluation
@@ -107,6 +108,7 @@ def __init__(self, cfg: DictConfig):
         Args:
             cfg: The configuration dictionary.
         """
+        super().__init__()
         self.cfg = cfg
         self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory
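For context on the SklearnModel changes: linear models such as SGDClassifier can be trained incrementally, which is presumably why sgd_classifier.yaml earlier in this diff exposes epochs rather than a single fit call. A minimal standalone sketch of the underlying scikit-learn mechanism, with synthetic data and invented shapes:

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.default_rng(0)
model = SGDClassifier(loss="log_loss")

# partial_fit consumes one batch at a time, so shards never have to be
# concatenated in memory; classes must be declared on the first call.
for _ in range(20):  # cf. model_params.epochs in sgd_classifier.yaml
    X = rng.normal(size=(32, 4))
    y = rng.integers(0, 2, size=32)
    model.partial_fit(X, y, classes=np.array([0, 1]))

print(model.predict_proba(rng.normal(size=(2, 4))))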
""" + super().__init__() self.cfg = cfg self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory diff --git a/tests/test_integration.py b/tests/test_integration.py index d22eac5..81336ed 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -112,7 +112,7 @@ def test_integration(): # Step 2: Run the static data tabularization script tabularize_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } stderr, stdout = run_command( @@ -161,7 +161,7 @@ def test_integration(): # Step 3: Run the time series tabularization script tabularize_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } @@ -203,7 +203,7 @@ def test_integration(): # Step 4: Run the task_specific_caching script cache_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } with initialize( diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index 7eacb1b..931b7a1 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -211,7 +211,7 @@ def test_tabularize(): # Step 2: Tabularization tabularize_static_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } @@ -289,7 +289,7 @@ def test_tabularize(): # Step 3: Cache Task data cache_config = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } @@ -313,7 +313,7 @@ def test_tabularize(): xgboost_config_kwargs = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } @@ -329,17 +329,17 @@ def test_tabularize(): output_files = list(output_dir.glob("**/*.json")) assert len(output_files) == 1 - basemodel_config_kwargs = { + sklearnmodel_config_kwargs = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", } with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in basemodel_config_kwargs.items()] - cfg = compose(config_name="launch_basemodel", overrides=overrides) # config.yaml + overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] + cfg = compose(config_name="launch_sklearnmodel", overrides=overrides) # config.yaml output_dir = Path(cfg.output_cohort_dir) / "model" @@ -347,9 +347,9 @@ def test_tabularize(): output_files = list(output_dir.glob("**/*.pkl")) assert len(output_files) == 1 - basemodel_config_kwargs = { + sklearnmodel_config_kwargs = { **shared_config, - "tabularization.min_code_inclusion_frequency": 1, + "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", "model_params.iterator.keep_data_in_memory": False, "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", @@ -358,8 +358,8 @@ def test_tabularize(): with initialize( version_base=None, config_path="../src/MEDS_tabular_automl/configs/" ): # path to 
diff --git a/tests/test_integration.py b/tests/test_integration.py
index d22eac5..81336ed 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -112,7 +112,7 @@ def test_integration():
     # Step 2: Run the static data tabularization script
     tabularize_config = {
         **shared_config,
-        "tabularization.min_code_inclusion_frequency": 1,
+        "tabularization.min_code_inclusion_count": 1,
         "tabularization.window_sizes": "[30d,365d,full]",
     }
     stderr, stdout = run_command(
@@ -161,7 +161,7 @@ def test_integration():
     # Step 3: Run the time series tabularization script
     tabularize_config = {
         **shared_config,
-        "tabularization.min_code_inclusion_frequency": 1,
+        "tabularization.min_code_inclusion_count": 1,
         "tabularization.window_sizes": "[30d,365d,full]",
     }
@@ -203,7 +203,7 @@ def test_integration():
     # Step 4: Run the task_specific_caching script
     cache_config = {
         **shared_config,
-        "tabularization.min_code_inclusion_frequency": 1,
+        "tabularization.min_code_inclusion_count": 1,
         "tabularization.window_sizes": "[30d,365d,full]",
     }
     with initialize(
diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py
index 7eacb1b..931b7a1 100644
--- a/tests/test_tabularize.py
+++ b/tests/test_tabularize.py
@@ -211,7 +211,7 @@ def test_tabularize():
     # Step 2: Tabularization
     tabularize_static_config = {
         **shared_config,
-        "tabularization.min_code_inclusion_frequency": 1,
+        "tabularization.min_code_inclusion_count": 1,
         "tabularization.window_sizes": "[30d,365d,full]",
     }
@@ -289,7 +289,7 @@ def test_tabularize():
     # Step 3: Cache Task data
     cache_config = {
         **shared_config,
-        "tabularization.min_code_inclusion_frequency": 1,
+        "tabularization.min_code_inclusion_count": 1,
         "tabularization.window_sizes": "[30d,365d,full]",
     }
@@ -313,7 +313,7 @@ def test_tabularize():

     xgboost_config_kwargs = {
         **shared_config,
-        "tabularization.min_code_inclusion_frequency": 1,
+        "tabularization.min_code_inclusion_count": 1,
         "tabularization.window_sizes": "[30d,365d,full]",
     }
@@ -329,17 +329,17 @@ def test_tabularize():
     output_files = list(output_dir.glob("**/*.json"))
     assert len(output_files) == 1

-    basemodel_config_kwargs = {
+    sklearnmodel_config_kwargs = {
         **shared_config,
-        "tabularization.min_code_inclusion_frequency": 1,
+        "tabularization.min_code_inclusion_count": 1,
         "tabularization.window_sizes": "[30d,365d,full]",
     }

     with initialize(
         version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
     ):  # path to config.yaml
-        overrides = [f"{k}={v}" for k, v in basemodel_config_kwargs.items()]
-        cfg = compose(config_name="launch_basemodel", overrides=overrides)  # config.yaml
+        overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()]
+        cfg = compose(config_name="launch_sklearnmodel", overrides=overrides)  # config.yaml

     output_dir = Path(cfg.output_cohort_dir) / "model"
@@ -347,9 +347,9 @@ def test_tabularize():
     output_files = list(output_dir.glob("**/*.pkl"))
     assert len(output_files) == 1

-    basemodel_config_kwargs = {
+    sklearnmodel_config_kwargs = {
         **shared_config,
-        "tabularization.min_code_inclusion_frequency": 1,
+        "tabularization.min_code_inclusion_count": 1,
         "tabularization.window_sizes": "[30d,365d,full]",
         "model_params.iterator.keep_data_in_memory": False,
         "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}",
     }

     with initialize(
         version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
     ):  # path to config.yaml
-        overrides = [f"{k}={v}" for k, v in basemodel_config_kwargs.items()]
-        cfg = compose(config_name="launch_basemodel", overrides=overrides)  # config.yaml
+        overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()]
+        cfg = compose(config_name="launch_sklearnmodel", overrides=overrides)  # config.yaml

     output_dir = Path(cfg.output_cohort_dir) / "model_online"
@@ -390,7 +390,7 @@ def test_xgboost_config():
         "hydra.verbose": True,
         "tqdm": False,
         "loguru_init": True,
-        "tabularization.min_code_inclusion_frequency": 1,
+        "tabularization.min_code_inclusion_count": 1,
         "tabularization.window_sizes": f"{stdout_ws.strip()}",
     }
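A closing note on the override renamed throughout these tests: min_code_inclusion_count filters codes by absolute occurrence count, whereas min_code_inclusion_frequency (now defaulting to null in tabularization/default.yaml) filters by relative frequency. A hypothetical sketch of the distinction, with invented codes and counts:

# Invented counts, purely for illustration.
code_counts = {"LAB//A": 500, "LAB//B": 40, "LAB//C": 3}
total = sum(code_counts.values())

min_code_inclusion_count = 10        # absolute: keep codes seen at least 10 times
min_code_inclusion_frequency = 0.10  # relative: keep codes covering >= 10% of events

by_count = [c for c, n in code_counts.items() if n >= min_code_inclusion_count]
by_freq = [c for c, n in code_counts.items() if n / total >= min_code_inclusion_frequency]
print(by_count)  # ['LAB//A', 'LAB//B']
print(by_freq)   # ['LAB//A']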