simplified pathing for results and evaluation callback
Oufattole committed Sep 10, 2024
1 parent 9e6d99a commit 8316365
Showing 15 changed files with 96 additions and 86 deletions.
6 changes: 4 additions & 2 deletions src/MEDS_tabular_automl/configs/launch_model.yaml
@@ -16,15 +16,17 @@ input_label_cache_dir: ${output_dir}/${task_name}/labels
# Where to output the model and cached data
output_model_dir: ???

time_output_model_dir: ${output_model_dir}/${now:%Y-%m-%d_%H-%M-%S}

delete_below_top_k: -1

name: launch_model

hydra:
sweep:
dir: ${output_model_dir}/sweeps/${now:%Y-%m-%d-%H-%M-%S}/
dir: ${time_output_model_dir}/hydra/
subdir: "1"
run:
dir: ${path.model_log_dir}
dir: ${path.sweep_results_dir}
sweeper:
direction: "maximize"
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml
@@ -1,3 +1,6 @@
defaults:
- default
- _self_

path:
model_file_stem: "autogluon"
@@ -15,8 +15,9 @@ model_launcher:
p: 2
metric: "minkowski"

path:
model_file_extension: .pkl
path:
model_file_extension: .pkl
model_file_stem: "knn_classifier"

hydra:
sweeper:
@@ -21,8 +21,9 @@ model_launcher:
solver: "lbfgs"
max_iter: 100

path:
model_file_extension: .pkl
path:
model_file_extension: .pkl
model_file_stem: "logistic_regression"

hydra:
sweeper:
@@ -1,10 +1,9 @@
input_tabularized_cache_dir: ${input_tabularized_cache_dir}
input_label_cache_dir: ${input_label_cache_dir}
output_model_dir: ${output_model_dir}
model_file_stem: model
model_file_extension: .json
log_dir: ${log_dir}
cache_dir: ${cache_dir}
model_log_dir: ${output_model_dir}/.logs/
sweep_results_dir: ${time_output_model_dir}/sweep_results/
best_trial_dir: ${time_output_model_dir}/best_trial/
performance_log_stem: performance
config_log_stem: config
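
For orientation, a minimal sketch of how the new path interpolations resolve, assuming a hypothetical output_model_dir; the timestamp component comes from the ${now:%Y-%m-%d_%H-%M-%S} resolver in launch_model.yaml:

from datetime import datetime
from pathlib import Path

# Hypothetical base directory; in practice this is the Hydra-resolved output_model_dir.
output_model_dir = Path("/models/mortality_task")

# time_output_model_dir = ${output_model_dir}/${now:%Y-%m-%d_%H-%M-%S}
time_output_model_dir = output_model_dir / datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Each sweep trial writes into its own subdirectory of sweep_results_dir;
# the evaluation callback later copies the winning trial into best_trial_dir.
sweep_results_dir = time_output_model_dir / "sweep_results"
best_trial_dir = time_output_model_dir / "best_trial"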
@@ -20,8 +20,9 @@ model_launcher:
min_impurity_decrease: 0.0
bootstrap: True

path:
model_file_extension: .pkl
path:
model_file_extension: .pkl
model_file_stem: "random_forest_classifier"

hydra:
sweeper:
@@ -11,8 +11,9 @@ model_launcher:
_target_: sklearn.linear_model.SGDClassifier
loss: log_loss

path:
model_file_extension: .pkl
path:
model_file_extension: .pkl
model_file_stem: "sgd_classifier"

hydra:
sweeper:
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml
@@ -17,6 +17,9 @@
num_boost_round: 1000
early_stopping_rounds: 5

path:
model_file_stem: "xgboost"

hydra:
sweeper:
params:
28 changes: 17 additions & 11 deletions src/MEDS_tabular_automl/evaluation_callback.py
@@ -1,3 +1,4 @@
import shutil
from pathlib import Path

import polars as pl
@@ -9,7 +10,7 @@
class EvaluationCallback(Callback):
def on_multirun_end(self, config: DictConfig, **kwargs):
"""Find best model based on log files and logger.info its performance and hyperparameters."""
log_fp = Path(config.path.model_log_dir)
log_fp = Path(config.path.sweep_results_dir)

try:
performance = pl.read_csv(log_fp / f"*/*{config.path.performance_log_stem}.log")
@@ -22,18 +23,22 @@ def on_multirun_end(self, config: DictConfig, **kwargs):
self.log_performance(performance[0, :])
if hasattr(config, "delete_below_top_k") and config.delete_below_top_k >= 0:
self.delete_below_top_k_models(
performance, config.delete_below_top_k, config.path.output_model_dir
performance, config.delete_below_top_k, config.path.sweep_results_dir
)
else:
logger.info(
"All models were saved. To automatically delete models, set delete_below_top_k in config."
)
best_trial_dir = Path(config.path.sweep_results_dir) / performance["trial_name"].cast(pl.String)[0]
output_best_trial_dir = Path(config.path.best_trial_dir)
shutil.copytree(best_trial_dir, output_best_trial_dir)
performance.write_parquet(config.time_output_model_dir / "sweep_results_summary.parquet")

return performance.head(1)

def log_performance(self, best_model_performance):
"""logger.info performance of the best model with nice formatting."""
best_model = best_model_performance["model_fp"][0]
best_model = best_model_performance["trial_name"][0]
tuning_auc = best_model_performance["tuning_auc"][0]
test_auc = best_model_performance["test_auc"][0]
log_performance_message = [
@@ -44,11 +49,11 @@ def log_performance(self, best_model_performance):
]
logger.info("\n".join(log_performance_message))

def delete_below_top_k_models(self, performance, k, model_dir):
def delete_below_top_k_models(self, performance, k, sweep_results_dir):
"""Save only top k models from the model directory and delete all other files.
Args:
performance: DataFrame containing model_fp and performance metrics.
performance: DataFrame containing trial_name and performance metrics.
k: Number of top models to save.
model_dir: Directory containing models.
@@ -57,14 +62,14 @@ def delete_below_top_k_models(self, performance, k, model_dir):
>>> import json
>>> performance = pl.DataFrame(
... {
... "model_fp": ["model1", "model2", "model3", "model4"],
... "trial_name": ["model1", "model2", "model3", "model4"],
... "tuning_auc": [0.9, 0.8, 0.7, 0.6],
... "test_auc": [0.9, 0.8, 0.7, 0.6],
... }
... )
>>> k = 2
>>> with tempfile.TemporaryDirectory() as model_dir:
... for model in performance["model_fp"]:
... for model in performance["trial_name"]:
... with open(Path(model_dir) / f"{model}.json", 'w') as f:
... json.dump({"model_name": model, "content": "dummy data"}, f)
... cb = EvaluationCallback()
@@ -74,7 +79,8 @@ def delete_below_top_k_models(self, performance, k, model_dir):
['model1', 'model2']
"""
logger.info(f"Deleting all models except top {k} models.")
top_k_models = performance.head(k)["model_fp"].to_list()
for model_fp in Path(model_dir).iterdir():
if model_fp.is_file() and model_fp.suffix != ".log" and str(model_fp.stem) not in top_k_models:
model_fp.unlink()
top_k_models = performance.head(k)["trial_name"].cast(pl.String).to_list()
logger.debug(f"Top {k} models: {top_k_models}")
for trial_dir in Path(sweep_results_dir).iterdir():
if trial_dir.stem not in top_k_models:
shutil.rmtree(trial_dir)
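
For context, each trial directory produced by launch_model.py (see its diff below) contains a one-row performance log with a trial_name,tuning_auc,test_auc header, and the callback aggregates them via polars' glob support. A rough sketch with hypothetical paths:

import polars as pl
from pathlib import Path

# Hypothetical timestamped sweep directory created by a multirun.
sweep_results_dir = Path("output_models/2024-09-10_12-00-00/sweep_results")

# Read every trial's performance.log in one call; polars expands the glob pattern.
performance = pl.read_csv(sweep_results_dir / "*/*performance.log")
performance = performance.sort("tuning_auc", descending=True)
print(performance.head(1))  # best trial by tuning AUC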
10 changes: 6 additions & 4 deletions src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -67,13 +67,13 @@ def main(cfg: DictConfig) -> float:
held_out_dataset = ag.TabularDataset(held_out_df)

# train model with AutoGluon
log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.config_log_stem}_log.txt"
log_filepath = Path(cfg.path.sweep_results_dir) / f"{cfg.path.config_log_stem}_log.txt"

predictor = ag.TabularPredictor(
label=cfg.task_name,
log_to_file=True,
log_file_path=str(log_filepath.resolve()),
path=cfg.output_model_dir,
path=cfg.time_output_model_dir,
).fit(train_data=train_dataset, tuning_data=tuning_dataset)

# predict
@@ -83,11 +83,13 @@ def main(cfg: DictConfig) -> float:
score = predictor.evaluate(held_out_dataset)
logger.info("Test score:", score)

model_performance_log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.performance_log_stem}.json"
model_performance_log_filepath = (
Path(cfg.path.sweep_results_dir) / f"{cfg.path.performance_log_stem}.json"
)
model_performance_log_filepath.parent.mkdir(parents=True, exist_ok=True)
# store results
performance_dict = {
"output_model_dir": cfg.path.output_model_dir,
"output_model_dir": cfg.path.time_output_model_dir,
"tabularization": OmegaConf.to_container(cfg.tabularization),
"model_launcher": OmegaConf.to_container(cfg.model_launcher),
"score": score,
38 changes: 26 additions & 12 deletions src/MEDS_tabular_automl/scripts/launch_model.py
@@ -1,14 +1,14 @@
import time
import json
from importlib.resources import files
from pathlib import Path

import hydra
from loguru import logger
from omegaconf import DictConfig
from omegaconf import DictConfig, OmegaConf

from MEDS_tabular_automl.base_model import BaseModel

from ..utils import hydra_loguru_init, log_to_logfile, stage_init
from ..utils import hydra_loguru_init, stage_init

config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml")
if not config_yaml.is_file():
@@ -43,17 +43,31 @@ def main(cfg: DictConfig) -> float:
model_launcher.train()
auc = model_launcher.evaluate()

# save model
output_model_dir = Path(cfg.output_model_dir)
# Make output model directory
path_cfg = model_launcher.cfg.path
model_filename = f"{path_cfg.model_file_stem}_{auc:.4f}_{time.time()}{path_cfg.model_file_extension}"
output_fp = output_model_dir / model_filename
output_model_dir.parent.mkdir(parents=True, exist_ok=True)

# log to logfile
log_to_logfile(model_launcher, cfg, output_fp.stem)
model_filename = f"{path_cfg.model_file_stem}{path_cfg.model_file_extension}"
model_config_hash = abs(hash(json.dumps(OmegaConf.to_container(cfg), sort_keys=True)))
trial_output_dir = Path(path_cfg.sweep_results_dir) / str(model_config_hash)
trial_output_dir.mkdir(parents=True, exist_ok=True)

model_launcher.save_model(output_fp)
# save model
model_launcher.save_model(trial_output_dir / model_filename)

# save model config
config_fp = trial_output_dir / f"{cfg.path.config_log_stem}.log"
with open(config_fp, "w") as f:
f.write(OmegaConf.to_yaml(cfg))

# save model performance
model_performance_fp = trial_output_dir / f"{cfg.path.performance_log_stem}.log"
with open(model_performance_fp, "w") as f:
f.write("trial_name,tuning_auc,test_auc\n")
f.write(
f"{trial_output_dir.stem},{model_launcher.evaluate()},"
f"{model_launcher.evaluate(split='held_out')}\n"
)

logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}")
return auc


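A minimal sketch of the trial-directory naming introduced above, using a plain dict in place of the resolved Hydra config (the actual code hashes OmegaConf.to_container(cfg)):

import json

# Hypothetical stand-in for the resolved launch config.
cfg = {"model_launcher": "xgboost", "tabularization": {"aggs": ["code/count"]}}

# A sorted JSON dump of the config is hashed so every distinct hyperparameter
# configuration gets its own sweep_results/<hash>/ trial directory.
model_config_hash = abs(hash(json.dumps(cfg, sort_keys=True)))
print(f"sweep_results/{model_config_hash}")
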
3 changes: 1 addition & 2 deletions src/MEDS_tabular_automl/sklearn_model.py
@@ -175,13 +175,12 @@ def evaluate(self, split: str = "tuning") -> float:
raise ValueError("Predictions or true labels are empty.")
return roc_auc_score(y_true, y_pred)

def save_model(self, output_fp: str):
def save_model(self, output_fp: Path):
"""Saves the model to the specified file path.
Args:
output_fp: The file path to save the model to.
"""
output_fp = Path(output_fp)
# check if model has save method
if not hasattr(self.model, "save_model"):
logger.info(f"Model {self.model.__class__.__name__} does not have a save_model method.")
27 changes: 0 additions & 27 deletions src/MEDS_tabular_automl/utils.py
@@ -418,33 +418,6 @@ def get_shard_prefix(base_path: Path, fp: Path) -> str:
return str(relative_parent / file_name)


def log_to_logfile(model, cfg, output_fp):
"""Log model hyperparameters and performance to two log files.
Args:
model: The model to log.
cfg: The configuration dictionary.
output_fp: The relative output file path.
"""
log_fp = Path(cfg.path.model_log_dir)

# make a folder to log everything for this model
out_fp = log_fp / output_fp
out_fp.mkdir(parents=True, exist_ok=True)

# config as a json
config_fp = out_fp / f"{cfg.path.config_log_stem}.log"
with open(config_fp, "w") as f:
f.write(OmegaConf.to_yaml(cfg))

model_performance_fp = out_fp / f"{cfg.path.performance_log_stem}.log"
with open(model_performance_fp, "w") as f:
f.write("model_fp,tuning_auc,test_auc\n")
f.write(f"{output_fp},{model.evaluate()},{model.evaluate(split='held_out')}\n")

logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}")


def current_script_name() -> str:
"""Returns the name of the module that called this function."""

21 changes: 13 additions & 8 deletions tests/test_integration.py
@@ -2,6 +2,7 @@

root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True)

import glob
import json
import shutil
import subprocess
@@ -291,8 +292,8 @@ def test_integration(tmp_path):
"output_model_dir": str(output_model_dir.resolve()),
"model_launcher": model,
"path.model_file_stem": model,
"hydra.sweeper.n_trials": 2,
"delete_below_top_k": 1,
"hydra.sweeper.n_trials": 3,
"delete_below_top_k": 2,
"data_loading_params.keep_data_in_memory": True,
}
overrides = [f"tabularization.aggs={stdout_agg.strip()}"]
@@ -305,9 +306,11 @@
stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}")
assert "Performance of best model:" in stderr
if model == "xgboost":
assert len(list_subdir_files(str(output_model_dir.resolve()), "json")) == 1
assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.json"))) == 2
assert len(glob.glob(str(output_model_dir / "*/best_trial/*.json"))) == 1
else:
assert len(list_subdir_files(str(output_model_dir.resolve()), "pkl")) == 1
assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.pkl"))) == 2
assert len(glob.glob(str(output_model_dir / "*/best_trial/*.pkl"))) == 1
shutil.rmtree(output_model_dir)

for model in [
@@ -322,8 +325,8 @@
"output_model_dir": str(output_model_dir.resolve()),
"model_launcher": model,
"path.model_file_stem": model,
"hydra.sweeper.n_trials": 2,
"delete_below_top_k": 1,
"hydra.sweeper.n_trials": 3,
"delete_below_top_k": 2,
"data_loading_params.keep_data_in_memory": False,
}
overrides = [f"tabularization.aggs={stdout_agg.strip()}"]
@@ -336,7 +339,9 @@
stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}")
assert "Performance of best model:" in stderr
if model == "xgboost":
assert len(list_subdir_files(str(output_model_dir.resolve()), "json")) == 1
assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.json"))) == 2
assert len(glob.glob(str(output_model_dir / "*/best_trial/*.json"))) == 1
else:
assert len(list_subdir_files(str(output_model_dir.resolve()), "pkl")) == 1
assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.pkl"))) == 2
assert len(glob.glob(str(output_model_dir / "*/best_trial/*.pkl"))) == 1
shutil.rmtree(output_model_dir)