diff --git a/pyproject.toml b/pyproject.toml
index 5070616..c6d9c6c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,9 @@
 meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main"
 meds-tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main"
 meds-tab-tabularize-time-series = "MEDS_tabular_automl.scripts.tabularize_time_series:main"
 meds-tab-cache-task = "MEDS_tabular_automl.scripts.cache_task:main"
-meds-tab-xgboost = "MEDS_tabular_automl.scripts.launch_xgboost:main"
+meds-tab-xgboost = "MEDS_tabular_automl.scripts.launch_model:main"
+meds-tab-model = "MEDS_tabular_automl.scripts.launch_model:main"
+meds-tab-autogluon = "MEDS_tabular_automl.scripts.launch_autogluon:main"
 generate-subsets = "MEDS_tabular_automl.scripts.generate_subsets:main"
 
diff --git a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml
index d9a9b74..c11d116 100644
--- a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml
+++ b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml
@@ -1,6 +1,8 @@
 defaults:
   - default
   - tabularization: default
+  - imputer: default
+  - normalization: default
   - override hydra/sweeper: optuna
   - override hydra/sweeper/sampler: tpe
   - override hydra/launcher: joblib
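
The entry-point block above keeps `meds-tab-xgboost` working as a backwards-compatible alias: it now routes to the same `launch_model:main` as the new `meds-tab-model` script. A minimal sketch of composing the unified `launch_model` config programmatically, mirroring the updated `test_xgboost_config` at the end of this diff (the override key is one that appears elsewhere in the diff; this is illustrative, not the launcher's full invocation):

    from hydra import compose, initialize

    with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"):
        cfg = compose(
            config_name="launch_model",
            overrides=["tabularization.min_code_inclusion_count=10"],
        )
        assert cfg.tabularization.window_sizes
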
diff --git a/src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml b/src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml
deleted file mode 100644
index 805593e..0000000
--- a/src/MEDS_tabular_automl/configs/launch_sklearnmodel.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-defaults:
-  - default
-  - tabularization: default
-  - override hydra/sweeper: optuna
-  - override hydra/sweeper/sampler: tpe
-  - override hydra/launcher: joblib
-  - _self_
-
-task_name: task
-
-# Task cached data dir
-input_dir: ${output_cohort_dir}/${task_name}/task_cache
-# Directory with task labels
-input_label_dir: ${output_cohort_dir}/${task_name}/labels/
-# Where to output the model and cached data
-model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S}
-output_filepath: ${model_dir}/model_metadata.json
-
-# Model parameters
-model_params:
-  epochs: 20
-  early_stopping_rounds: 5
-  model:
-    _target_: sklearn.linear_model.SGDClassifier
-    loss: log_loss
-    # n_iter: ${model_params.epochs} # not sure if we want this behaviour
-  iterator:
-    keep_data_in_memory: True
-    binarize_task: True
-
-log_dir: ${model_dir}/.logs/
-
-name: launch_sklearnmodel
diff --git a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml
deleted file mode 100644
index a95187b..0000000
--- a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-defaults:
-  - default
-  - tabularization: default
-  - override hydra/sweeper: optuna
-  - override hydra/sweeper/sampler: tpe
-  - override hydra/launcher: joblib
-  - _self_
-
-task_name: task
-
-# Task cached data dir
-input_dir: ${output_cohort_dir}/${task_name}/task_cache
-# Directory with task labels
-input_label_dir: ${output_cohort_dir}/${task_name}/labels/
-# Where to output the model and cached data
-model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S}
-output_filepath: ${model_dir}/model_metadata.json
-
-# Model parameters
-model_params:
-  num_boost_round: 1000
-  early_stopping_rounds: 5
-  model:
-    booster: gbtree
-    device: cpu
-    nthread: 1
-    tree_method: hist
-    objective: binary:logistic
-  iterator:
-    keep_data_in_memory: True
-    binarize_task: True
-
-log_dir: ${model_dir}/.logs/
-
-hydra:
-  # Optuna Sweeper
-  sweeper:
-    sampler:
-      seed: 1
-    study_name: null #study_${now:%Y-%m-%d_%H-%M-%S}
-    storage: null
-    direction: maximize
-    n_trials: 250
-    n_jobs: 25
-
-    # Define search space for Optuna
-    params:
-      +model_params.model.eta: tag(log, interval(0.001, 1))
-      +model_params.model.lambda: tag(log, interval(0.001, 1))
-      +model_params.model.alpha: tag(log, interval(0.001, 1))
-      +model_params.model.subsample: interval(0.5, 1)
-      +model_params.model.min_child_weight: interval(1e-2, 100)
-      model_params.num_boost_round: range(100, 1000)
-      model_params.early_stopping_rounds: range(1, 10)
-      +model_params.model.max_depth: range(2, 16)
-      tabularization.min_code_inclusion_count: tag(log, range(10, 1000000))
-
-name: launch_xgboost
diff --git a/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py b/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py
deleted file mode 100644
index 8e76872..0000000
--- a/src/MEDS_tabular_automl/scripts/launch_sklearnmodel.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from importlib.resources import files
-from pathlib import Path
-
-import hydra
-from loguru import logger
-from omegaconf import DictConfig
-
-from ..sklearn_model import SklearnModel
-from ..utils import hydra_loguru_init
-
-config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_sklearnmodel.yaml")
-if not config_yaml.is_file():
-    raise FileNotFoundError("Core configuration not successfully installed!")
-
-
-@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem)
-def main(cfg: DictConfig) -> float:
-    """Optimizes the model based on the provided configuration.
-
-    Args:
-        cfg: The configuration dictionary specifying model and training parameters.
-
-    Returns:
-        The evaluation result as the ROC AUC score on the held-out test set.
-    """
-
-    # print(OmegaConf.to_yaml(cfg))
-    if not cfg.loguru_init:
-        hydra_loguru_init()
-    try:
-        model = SklearnModel(cfg)
-        model.train()
-        auc = model.evaluate()
-        logger.info(f"AUC: {auc}")
-
-        # print(
-        #     "Time Profiling for window sizes ",
-        #     f"{cfg.tabularization.window_sizes} and min ",
-        #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
-        # )
-        # print("Train Time: \n", model._profile_durations())
-        # print("Train Iterator Time: \n", model.itrain._profile_durations())
-        # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
-        # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
-
-        # save model
-        output_fp = Path(cfg.output_filepath)
-        output_fp.parent.mkdir(parents=True, exist_ok=True)
-
-        model.save_model(output_fp)
-    except Exception as e:
-        logger.error(f"Error occurred: {e}")
-        auc = 0.0
-    return auc
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py
deleted file mode 100644
index fd09e70..0000000
--- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from importlib.resources import files
-from pathlib import Path
-
-import hydra
-from loguru import logger
-from omegaconf import DictConfig
-
-from ..utils import hydra_loguru_init
-from ..xgboost_model import XGBoostModel
-
-config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml")
-if not config_yaml.is_file():
-    raise FileNotFoundError("Core configuration not successfully installed!")
-
-
-@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem)
-def main(cfg: DictConfig) -> float:
-    """Optimizes the model based on the provided configuration.
-
-    Args:
-        cfg: The configuration dictionary specifying model and training parameters.
-
-    Returns:
-        The evaluation result as the ROC AUC score on the held-out test set.
-    """
-
-    # print(OmegaConf.to_yaml(cfg))
-    if not cfg.loguru_init:
-        hydra_loguru_init()
-    try:
-        model = XGBoostModel(cfg)
-        model.train()
-        auc = model.evaluate()
-        logger.info(f"AUC: {auc}")
-
-        # print(
-        #     "Time Profiling for window sizes ",
-        #     f"{cfg.tabularization.window_sizes} and min ",
-        #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
-        # )
-        # print("Train Time: \n", model._profile_durations())
-        # print("Train Iterator Time: \n", model.itrain._profile_durations())
-        # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
-        # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
-
-        # save model
-        output_fp = Path(cfg.output_filepath)
-        output_fp.parent.mkdir(parents=True, exist_ok=True)
-
-        model.save_model(output_fp)
-    except Exception as e:
-        logger.error(f"Error occurred: {e}")
-        auc = 0.0
-    return auc
-
-
-if __name__ == "__main__":
-    main()
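
The two deleted launchers are line-for-line identical apart from the model class they instantiate (`SklearnModel` vs. `XGBoostModel`), which is exactly the duplication the unified `launch_model` script removes. One detail worth keeping when reading them: `main` returns the held-out AUC as a `float` because the Optuna sweeper configured in the deleted YAML (`direction: maximize`) optimizes that return value, and the `except` branch returns 0.0 so a failed trial is scored as a loss rather than aborting the sweep. A condensed, runnable sketch of that shared skeleton; `ModelClass` is a hypothetical stand-in for either model class:

    from loguru import logger

    class ModelClass:
        """Hypothetical stand-in for SklearnModel / XGBoostModel."""

        def __init__(self, cfg):
            self.cfg = cfg

        def train(self) -> None: ...

        def evaluate(self) -> float:
            return 0.5  # held-out ROC AUC

    def main(cfg) -> float:
        try:
            model = ModelClass(cfg)
            model.train()
            auc = model.evaluate()
            logger.info(f"AUC: {auc}")
        except Exception as e:
            logger.error(f"Error occurred: {e}")
            auc = 0.0  # failed trials score 0.0 for the maximizing sweeper
        return auc
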
diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py
index 5e14f91..ff918fe 100644
--- a/src/MEDS_tabular_automl/tabular_dataset.py
+++ b/src/MEDS_tabular_automl/tabular_dataset.py
@@ -224,7 +224,10 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr
 
     def _set_imputer(self):
         """Sets the imputer for the data."""
-        if self.cfg.model_params.iterator.imputer.imputer_target:
+        if (
+            hasattr(self.cfg.model_params.iterator, "imputer")
+            and self.cfg.model_params.iterator.imputer.imputer_target
+        ):
             imputer = self.cfg.model_params.iterator.imputer.imputer_target
             if hasattr(imputer, "partial_fit"):
                 for i in range(len(self._data_shards)):
@@ -240,7 +243,10 @@ def _set_imputer(self):
 
     def _set_scaler(self):
         """Sets the scaler for the data."""
-        if self.cfg.model_params.iterator.normalization.normalizer:
+        if (
+            hasattr(self.cfg.model_params.iterator, "normalization")
+            and self.cfg.model_params.iterator.normalization.normalizer
+        ):
             scaler = self.cfg.model_params.iterator.normalization.normalizer
             if hasattr(scaler, "partial_fit"):
                 for i in range(len(self._data_shards)):
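
The `hasattr` guards make `_set_imputer` and `_set_scaler` safe for launcher configs that do not compose the new `imputer`/`normalization` groups at all (only `launch_autogluon.yaml` gains them in this diff). A minimal sketch of the behavior, assuming a struct-mode OmegaConf config like the ones Hydra composes, where attribute access on a missing key raises and `hasattr` therefore returns False:

    from omegaconf import OmegaConf

    # A config whose iterator has no `imputer` group, like the pre-existing launchers.
    cfg = OmegaConf.create({"model_params": {"iterator": {"keep_data_in_memory": True}}})
    OmegaConf.set_struct(cfg, True)  # Hydra composes configs in struct mode

    iterator = cfg.model_params.iterator
    if hasattr(iterator, "imputer") and iterator.imputer.imputer_target:
        print("imputer configured")
    else:
        print("no imputer group; imputation is skipped")  # this branch runs
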
diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py
index ef2582e..9a48041 100644
--- a/tests/test_tabularize.py
+++ b/tests/test_tabularize.py
@@ -2,7 +2,9 @@
 
 root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True)
 
+import importlib.util
 import json
+import os
 import subprocess
 import tempfile
 from io import StringIO
@@ -370,25 +372,31 @@ def test_tabularize():
         output_files = list(output_dir.glob("**/*.pkl"))
         assert len(output_files) == 1
 
-    # autogluon_config_kwargs = {
-    #     **shared_config,
-    #     "tabularization.min_code_inclusion_count": 1,
-    #     "tabularization.window_sizes": "[30d,365d,full]",
-    #     "model_params.iterator.keep_data_in_memory": False,
-    #     "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}",
-    # }
+    if importlib.util.find_spec("autogluon") is not None:
+        import autogluon as ag
 
-    # with initialize(
-    #     version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
-    # ):  # path to config.yaml
-    #     overrides = [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()]
-    #     cfg = compose(config_name="launch_sklearnmodel", overrides=overrides)  # config.yaml
+        from MEDS_tabular_automl.scripts import launch_autogluon
 
-    #     output_dir = Path(cfg.output_cohort_dir) / "model_online"
+        autogluon_config_kwargs = {
+            **shared_config,
+            "tabularization.min_code_inclusion_count": 1,
+            "tabularization.window_sizes": "[30d,365d,full]",
+            "model_params.iterator.keep_data_in_memory": False,
+            "model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}",
+        }
 
-    #     launch_model.main(cfg)
-    #     output_files = list(output_dir.glob("**/*.pkl"))
-    #     assert len(output_files) == 1
+        with initialize(
+            version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
+        ):  # path to config.yaml
+            overrides = [f"{k}={v}" for k, v in autogluon_config_kwargs.items()]
+            cfg = compose(config_name="launch_autogluon", overrides=overrides)  # config.yaml
+
+            output_dir = Path(cfg.output_cohort_dir) / "model_online"
+
+            launch_autogluon.main(cfg)
+            output_files = list(output_dir.glob("*"))
+            most_recent_file = max(output_files, key=os.path.getmtime)
+            ag.tabular.TabularPredictor.load(most_recent_file)
 
 
 def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str):
@@ -421,5 +429,5 @@ def test_xgboost_config():
         version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
     ):  # path to config.yaml
         overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()]
-        cfg = compose(config_name="launch_xgboost", overrides=overrides)  # config.yaml
+        cfg = compose(config_name="launch_model", overrides=overrides)  # config.yaml
         assert cfg.tabularization.window_sizes
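
Two details of the new test block are worth noting. `importlib.util.find_spec("autogluon")` gates the whole body, so the suite still passes when the optional AutoGluon dependency is not installed. And because the launcher writes each predictor into a timestamped `model_${now:...}` directory, the test locates the newest entry by modification time before loading it. The discovery pattern in isolation; the path here is illustrative, not the test's actual output location:

    import os
    from pathlib import Path

    output_dir = Path("output_cohort/model_online")  # hypothetical location
    candidates = list(output_dir.glob("*"))
    if candidates:
        # Newest timestamped model directory wins.
        most_recent = max(candidates, key=os.path.getmtime)
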