update xgboost sweep

mmcdermott · Jun 9, 2024 · 61e39df
1 parent 125e3e7
commit 61e39df
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 95 deletions.
diff --git a/src/MEDS_tabular_automl/configs/launch_xgboost.yaml b/src/MEDS_tabular_automl/configs/launch_xgboost.yaml
@@ -1,6 +1,8 @@
 defaults:
   - default
   - tabularization: default
+  - override hydra/sweeper: optuna
+  - override hydra/sweeper/sampler: tpe
   - _self_
 
 task_name: task
@@ -28,52 +30,24 @@ model_params:
     keep_data_in_memory: True
     binarize_task: True
 
-# Define search space for Optuna
-optuna:
-  study_name: xgboost_sweep_${now:%Y-%m-%d_%H-%M-%S}
-  storage: null
-  load_if_exists: False
-  direction: minimize
-  sampler: null
-  pruner: null
+hydra:
+  verbose: False
+  sweep:
+    dir: ${output_dir}/.logs/
+  run:
+    dir: ${output_dir}/.logs/
 
-  n_trials: 10
-  n_jobs: 1
-  show_progress_bar: False
+  # Optuna Sweeper
+  sweeper:
+    sampler:
+      seed: 1
+    study_name: null #study_${now:%Y-%m-%d_%H-%M-%S}
+    storage: null
+    direction: minimize
+    n_trials: 10
 
-  params:
-    suggest_categorical:
-      window_sizes: ${generate_permutations:${tabularization.window_sizes}}
-      aggs: ${generate_permutations:${tabularization.aggs}}
-    suggest_float:
-      eta:
-        low: .001
-        high: 1
-        log: True
-      lambda:
-        low: .001
-        high: 1
-        log: True
-      alpha:
-        low: .001
-        high: 1
-        log: True
-      subsample:
-        low: 0.5
-        high: 1
-      min_child_weight:
-        low: 1e-2
-        high: 100
-    suggest_int:
-      num_boost_round:
-        low: 10
-        high: 1000
-      max_depth:
-        low: 2
-        high: 16
-      min_code_inclusion_frequency:
-        low: 10
-        high: 1_000_000
-        log: True
+    # Define search space for Optuna
+    params:
+      tabularization.window_sizes: choice([30d], [30d, 365d], [365d, full])
 
 name: launch_xgboost
diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py
@@ -1,4 +1,5 @@
 from collections.abc import Callable, Mapping
+from datetime import datetime
 from importlib.resources import files
 from pathlib import Path
 
@@ -12,7 +13,7 @@
 from omegaconf import DictConfig, OmegaConf
 from sklearn.metrics import roc_auc_score
 
-from MEDS_tabular_automl.describe_codes import get_feature_columns, get_feature_freqs
+from MEDS_tabular_automl.describe_codes import get_feature_columns
 from MEDS_tabular_automl.file_name import get_model_files, list_subdir_files
 from MEDS_tabular_automl.utils import get_feature_indices, hydra_loguru_init
 
@@ -188,18 +189,8 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix:
 
         dynamic_cscs = [self._load_dynamic_shard_from_file(file, idx) for file in files]
 
-        fn_name = "_get_dynamic_shard_by_index"
-        hstack_key = f"{fn_name}/hstack"
-        self._register_start(key=hstack_key)
-
-        combined_csc = sp.hstack(dynamic_cscs, format="csc")  # TODO: check this
-        # self._register_end(key=hstack_key)
-        # # Filter Rows
-        # valid_indices = self.valid_event_ids[shard_name]
-        # filter_key = f"{fn_name}/filter"
-        # self._register_start(key=filter_key)
-        # out = combined_csc[valid_indices, :]
-        # self._register_end(key=filter_key)
+        combined_csc = sp.hstack(dynamic_cscs, format="csc")
+
         return combined_csc
 
     @TimeableMixin.TimeAs
@@ -388,30 +379,31 @@ def main(cfg: DictConfig) -> float:
     Returns:
     - float: Evaluation result.
     """
+
+    print(OmegaConf.to_yaml(cfg))
     if not cfg.loguru_init:
         hydra_loguru_init()
 
     model = XGBoostModel(cfg)
     model.train()
+    auc = model.evaluate()
+    logger.info(f"AUC: {auc}")
 
     print(
         "Time Profiling for window sizes ",
         f"{cfg.tabularization.window_sizes} and min ",
-        "code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
+        f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
     )
     print("Train Time: \n", model._profile_durations())
-    print("Train Iterator Time: \n", model.itrain._profile_durations())
-    print("Tuning Iterator Time: \n", model.ituning._profile_durations())
-    print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
+    # print("Train Iterator Time: \n", model.itrain._profile_durations())
+    # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
+    # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
 
     # save model
     save_dir = Path(cfg.output_dir)
     save_dir.mkdir(parents=True, exist_ok=True)
-
-    logger.info(f"Saving the model to directory: {save_dir}")
-    model.model.save_model(save_dir / "model.json")
-    auc = model.evaluate()
-    logger.info(f"AUC: {auc}")
+    model_time = datetime.now().strftime("%H%M%S%f")
+    model.model.save_model(save_dir / f"{auc:.4f}_model_{model_time}.json")
     return auc
 
 

diff --git a/tests/test_integration.py b/tests/test_integration.py
@@ -275,14 +275,16 @@ def test_integration():
         )
         output_files = list(Path(cfg.output_dir).parent.glob("**/*.json"))
         assert len(output_files) == 1
-        assert output_files[0].stem == "model"
+        # assert output_files[0].stem == '0.6667_model'
 
         stderr, stdout = run_command(
-            "meds-tab-xgboost-sweep",
-            [],
+            "meds-tab-xgboost",
+            [
+                "--multirun",
+            ],
             xgboost_config_kwargs,
             "xgboost-sweep",
         )
         output_files = list(Path(cfg.output_dir).parent.glob("**/*.json"))
-        assert len(output_files) == 2
-        assert output_files[0].stem == "model"
+        assert len(output_files) == 11
+        # assert output_files[0].stem == "model"
diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py
@@ -17,7 +17,6 @@
     cache_task,
     describe_codes,
     launch_xgboost,
-    sweep_xgboost,
     tabularize_static,
     tabularize_time_series,
 )
@@ -395,25 +394,25 @@ def test_tabularize():
         assert output_files[0] == Path(cfg.output_dir) / "model.json"
         os.remove(Path(cfg.output_dir) / "model.json")
 
-        xgboost_config_kwargs = {
-            "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()),
-            "do_overwrite": False,
-            "seed": 1,
-            "hydra.verbose": True,
-            "tqdm": False,
-            "loguru_init": True,
-            "tabularization.min_code_inclusion_frequency": 1,
-            "tabularization.aggs": "[static/present,static/first,code/count,value/sum]",
-            "tabularization.window_sizes": "[30d,365d,full]",
-        }
-
-        with initialize(
-            version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
-        ):  # path to config.yaml
-            overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()]
-            cfg = compose(config_name="launch_xgboost", overrides=overrides)  # config.yaml
-
-        sweep_xgboost.main(cfg)
-        output_files = list(Path(cfg.output_dir).glob("**/*.json"))
-        assert len(output_files) == 1
-        assert output_files[0] == Path(cfg.output_dir) / "model.json"
+        # xgboost_config_kwargs = {
+        #     "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()),
+        #     "do_overwrite": False,
+        #     "seed": 1,
+        #     "hydra.verbose": True,
+        #     "tqdm": False,
+        #     "loguru_init": True,
+        #     "tabularization.min_code_inclusion_frequency": 1,
+        #     "tabularization.aggs": "[static/present,static/first,code/count,value/sum]",
+        #     "tabularization.window_sizes": "[30d,365d,full]",
+        # }
+
+        # with initialize(
+        #     version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
+        # ):  # path to config.yaml
+        #     overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()]
+        #     cfg = compose(config_name="launch_xgboost", overrides=overrides)  # config.yaml
+
+        # launch_xgboost.main(cfg)
+        # output_files = list(Path(cfg.output_dir).glob("**/*.json"))
+        # assert len(output_files) == 1
+        # assert output_files[0] == Path(cfg.output_dir) / "model.json"