simplified pathing for results and evaluation callback
Oufattole committed Sep 10, 2024
1 parent 9e6d99a commit 8316365
Showing 15 changed files with 96 additions and 86 deletions.
6 changes: 4 additions & 2 deletions src/MEDS_tabular_automl/configs/launch_model.yaml
@@ -16,15 +16,17 @@ input_label_cache_dir: ${output_dir}/${task_name}/labels
# Where to output the model and cached data
output_model_dir: ???

time_output_model_dir: ${output_model_dir}/${now:%Y-%m-%d_%H-%M-%S}

delete_below_top_k: -1

name: launch_model

hydra:
sweep:
dir: ${output_model_dir}/sweeps/${now:%Y-%m-%d-%H-%M-%S}/
dir: ${time_output_model_dir}/hydra/
subdir: "1"
run:
dir: ${path.model_log_dir}
dir: ${path.sweep_results_dir}
sweeper:
direction: "maximize"
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml
@@ -1,3 +1,6 @@
defaults:
- default
- _self_

path:
model_file_stem: "autogluon"
@@ -15,8 +15,9 @@ model_launcher:
p: 2
metric: "minkowski"

path:
model_file_extension: .pkl
path:
model_file_extension: .pkl
model_file_stem: "knn_classifier"

hydra:
sweeper:
@@ -21,8 +21,9 @@ model_launcher:
solver: "lbfgs"
max_iter: 100

path:
model_file_extension: .pkl
path:
model_file_extension: .pkl
model_file_stem: "logistic_regression"

hydra:
sweeper:
@@ -1,10 +1,9 @@
input_tabularized_cache_dir: ${input_tabularized_cache_dir}
input_label_cache_dir: ${input_label_cache_dir}
output_model_dir: ${output_model_dir}
model_file_stem: model
model_file_extension: .json
log_dir: ${log_dir}
cache_dir: ${cache_dir}
model_log_dir: ${output_model_dir}/.logs/
sweep_results_dir: ${time_output_model_dir}/sweep_results/
best_trial_dir: ${time_output_model_dir}/best_trial/
performance_log_stem: performance
config_log_stem: config
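
For orientation, a minimal sketch of how the new path interpolations resolve, assuming a hypothetical output_model_dir; the timestamp component comes from the ${now:%Y-%m-%d_%H-%M-%S} resolver in launch_model.yaml:

from datetime import datetime
from pathlib import Path

# Hypothetical base directory; in practice this is the Hydra-resolved output_model_dir.
output_model_dir = Path("/models/mortality_task")

# time_output_model_dir = ${output_model_dir}/${now:%Y-%m-%d_%H-%M-%S}
time_output_model_dir = output_model_dir / datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Each sweep trial writes into its own subdirectory of sweep_results_dir;
# the evaluation callback later copies the winning trial into best_trial_dir.
sweep_results_dir = time_output_model_dir / "sweep_results"
best_trial_dir = time_output_model_dir / "best_trial"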
@@ -20,8 +20,9 @@ model_launcher:
min_impurity_decrease: 0.0
bootstrap: True

path:
model_file_extension: .pkl
path:
model_file_extension: .pkl
model_file_stem: "random_forest_classifier"

hydra:
sweeper:
@@ -11,8 +11,9 @@ model_launcher:
_target_: sklearn.linear_model.SGDClassifier
loss: log_loss

path:
model_file_extension: .pkl
path:
model_file_extension: .pkl
model_file_stem: "sgd_classifier"

hydra:
sweeper:
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml
@@ -17,6 +17,9 @@
num_boost_round: 1000
early_stopping_rounds: 5

path:
model_file_stem: "xgboost"

hydra:
sweeper:
params:
28 changes: 17 additions & 11 deletions src/MEDS_tabular_automl/evaluation_callback.py
@@ -1,3 +1,4 @@
import shutil
from pathlib import Path

import polars as pl
@@ -9,7 +10,7 @@
class EvaluationCallback(Callback):
def on_multirun_end(self, config: DictConfig, **kwargs):
"""Find best model based on log files and logger.info its performance and hyperparameters."""
log_fp = Path(config.path.model_log_dir)
log_fp = Path(config.path.sweep_results_dir)

try:
performance = pl.read_csv(log_fp / f"*/*{config.path.performance_log_stem}.log")
@@ -22,18 +23,22 @@ def on_multirun_end(self, config: DictConfig, **kwargs):
self.log_performance(performance[0, :])
if hasattr(config, "delete_below_top_k") and config.delete_below_top_k >= 0:
self.delete_below_top_k_models(
performance, config.delete_below_top_k, config.path.output_model_dir
performance, config.delete_below_top_k, config.path.sweep_results_dir
)
else:
logger.info(
"All models were saved. To automatically delete models, set delete_below_top_k in config."
)
best_trial_dir = Path(config.path.sweep_results_dir) / performance["trial_name"].cast(pl.String)[0]
output_best_trial_dir = Path(config.path.best_trial_dir)
shutil.copytree(best_trial_dir, output_best_trial_dir)
performance.write_parquet(config.time_output_model_dir / "sweep_results_summary.parquet")

return performance.head(1)

def log_performance(self, best_model_performance):
"""logger.info performance of the best model with nice formatting."""
best_model = best_model_performance["model_fp"][0]
best_model = best_model_performance["trial_name"][0]
tuning_auc = best_model_performance["tuning_auc"][0]
test_auc = best_model_performance["test_auc"][0]
log_performance_message = [
@@ -44,11 +49,11 @@ def log_performance(self, best_model_performance):
]
logger.info("\n".join(log_performance_message))

def delete_below_top_k_models(self, performance, k, model_dir):
def delete_below_top_k_models(self, performance, k, sweep_results_dir):
"""Save only top k models from the model directory and delete all other files.
Args:
performance: DataFrame containing model_fp and performance metrics.
performance: DataFrame containing trial_name and performance metrics.
k: Number of top models to save.
model_dir: Directory containing models.
@@ -57,14 +62,14 @@ def delete_below_top_k_models(self, performance, k, model_dir):
>>> import json
>>> performance = pl.DataFrame(
... {
... "model_fp": ["model1", "model2", "model3", "model4"],
... "trial_name": ["model1", "model2", "model3", "model4"],
... "tuning_auc": [0.9, 0.8, 0.7, 0.6],
... "test_auc": [0.9, 0.8, 0.7, 0.6],
... }
... )
>>> k = 2
>>> with tempfile.TemporaryDirectory() as model_dir:
... for model in performance["model_fp"]:
... for model in performance["trial_name"]:
... with open(Path(model_dir) / f"{model}.json", 'w') as f:
... json.dump({"model_name": model, "content": "dummy data"}, f)
... cb = EvaluationCallback()
@@ -74,7 +79,8 @@ def delete_below_top_k_models(self, performance, k, model_dir):
['model1', 'model2']
"""
logger.info(f"Deleting all models except top {k} models.")
top_k_models = performance.head(k)["model_fp"].to_list()
for model_fp in Path(model_dir).iterdir():
if model_fp.is_file() and model_fp.suffix != ".log" and str(model_fp.stem) not in top_k_models:
model_fp.unlink()
top_k_models = performance.head(k)["trial_name"].cast(pl.String).to_list()
logger.debug(f"Top {k} models: {top_k_models}")
for trial_dir in Path(sweep_results_dir).iterdir():
if trial_dir.stem not in top_k_models:
shutil.rmtree(trial_dir)
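
For context, each trial directory produced by launch_model.py (see its diff below) contains a one-row performance log with a trial_name,tuning_auc,test_auc header, and the callback aggregates them via polars' glob support. A rough sketch with hypothetical paths:

import polars as pl
from pathlib import Path

# Hypothetical timestamped sweep directory created by a multirun.
sweep_results_dir = Path("output_models/2024-09-10_12-00-00/sweep_results")

# Read every trial's performance.log in one call; polars expands the glob pattern.
performance = pl.read_csv(sweep_results_dir / "*/*performance.log")
performance = performance.sort("tuning_auc", descending=True)
print(performance.head(1))  # best trial by tuning AUC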
10 changes: 6 additions & 4 deletions src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -67,13 +67,13 @@ def main(cfg: DictConfig) -> float:
held_out_dataset = ag.TabularDataset(held_out_df)

# train model with AutoGluon
log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.config_log_stem}_log.txt"
log_filepath = Path(cfg.path.sweep_results_dir) / f"{cfg.path.config_log_stem}_log.txt"

predictor = ag.TabularPredictor(
label=cfg.task_name,
log_to_file=True,
log_file_path=str(log_filepath.resolve()),
path=cfg.output_model_dir,
path=cfg.time_output_model_dir,
).fit(train_data=train_dataset, tuning_data=tuning_dataset)

# predict
@@ -83,11 +83,13 @@ def main(cfg: DictConfig) -> float:
score = predictor.evaluate(held_out_dataset)
logger.info("Test score:", score)

model_performance_log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.performance_log_stem}.json"
model_performance_log_filepath = (
Path(cfg.path.sweep_results_dir) / f"{cfg.path.performance_log_stem}.json"
)
model_performance_log_filepath.parent.mkdir(parents=True, exist_ok=True)
# store results
performance_dict = {
"output_model_dir": cfg.path.output_model_dir,
"output_model_dir": cfg.path.time_output_model_dir,
"tabularization": OmegaConf.to_container(cfg.tabularization),
"model_launcher": OmegaConf.to_container(cfg.model_launcher),
"score": score,
38 changes: 26 additions & 12 deletions src/MEDS_tabular_automl/scripts/launch_model.py
@@ -1,14 +1,14 @@
import time
import json
from importlib.resources import files
from pathlib import Path

import hydra
from loguru import logger
from omegaconf import DictConfig
from omegaconf import DictConfig, OmegaConf

from MEDS_tabular_automl.base_model import BaseModel

from ..utils import hydra_loguru_init, log_to_logfile, stage_init
from ..utils import hydra_loguru_init, stage_init

config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml")
if not config_yaml.is_file():
@@ -43,17 +43,31 @@ def main(cfg: DictConfig) -> float:
model_launcher.train()
auc = model_launcher.evaluate()

# save model
output_model_dir = Path(cfg.output_model_dir)
# Make output model directory
path_cfg = model_launcher.cfg.path
model_filename = f"{path_cfg.model_file_stem}_{auc:.4f}_{time.time()}{path_cfg.model_file_extension}"
output_fp = output_model_dir / model_filename
output_model_dir.parent.mkdir(parents=True, exist_ok=True)

# log to logfile
log_to_logfile(model_launcher, cfg, output_fp.stem)
model_filename = f"{path_cfg.model_file_stem}{path_cfg.model_file_extension}"
model_config_hash = abs(hash(json.dumps(OmegaConf.to_container(cfg), sort_keys=True)))
trial_output_dir = Path(path_cfg.sweep_results_dir) / str(model_config_hash)
trial_output_dir.mkdir(parents=True, exist_ok=True)

model_launcher.save_model(output_fp)
# save model
model_launcher.save_model(trial_output_dir / model_filename)

# save model config
config_fp = trial_output_dir / f"{cfg.path.config_log_stem}.log"
with open(config_fp, "w") as f:
f.write(OmegaConf.to_yaml(cfg))

# save model performance
model_performance_fp = trial_output_dir / f"{cfg.path.performance_log_stem}.log"
with open(model_performance_fp, "w") as f:
f.write("trial_name,tuning_auc,test_auc\n")
f.write(
f"{trial_output_dir.stem},{model_launcher.evaluate()},"
f"{model_launcher.evaluate(split='held_out')}\n"
)

logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}")
return auc


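A minimal sketch of the trial-directory naming introduced above, using a plain dict in place of the resolved Hydra config (the actual code hashes OmegaConf.to_container(cfg)):

import json

# Hypothetical stand-in for the resolved launch config.
cfg = {"model_launcher": "xgboost", "tabularization": {"aggs": ["code/count"]}}

# A sorted JSON dump of the config is hashed so every distinct hyperparameter
# configuration gets its own sweep_results/<hash>/ trial directory.
model_config_hash = abs(hash(json.dumps(cfg, sort_keys=True)))
print(f"sweep_results/{model_config_hash}")
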
3 changes: 1 addition & 2 deletions src/MEDS_tabular_automl/sklearn_model.py
@@ -175,13 +175,12 @@ def evaluate(self, split: str = "tuning") -> float:
raise ValueError("Predictions or true labels are empty.")
return roc_auc_score(y_true, y_pred)

def save_model(self, output_fp: str):
def save_model(self, output_fp: Path):
"""Saves the model to the specified file path.
Args:
output_fp: The file path to save the model to.
"""
output_fp = Path(output_fp)
# check if model has save method
if not hasattr(self.model, "save_model"):
logger.info(f"Model {self.model.__class__.__name__} does not have a save_model method.")
27 changes: 0 additions & 27 deletions src/MEDS_tabular_automl/utils.py
@@ -418,33 +418,6 @@ def get_shard_prefix(base_path: Path, fp: Path) -> str:
return str(relative_parent / file_name)


def log_to_logfile(model, cfg, output_fp):
"""Log model hyperparameters and performance to two log files.
Args:
model: The model to log.
cfg: The configuration dictionary.
output_fp: The relative output file path.
"""
log_fp = Path(cfg.path.model_log_dir)

# make a folder to log everything for this model
out_fp = log_fp / output_fp
out_fp.mkdir(parents=True, exist_ok=True)

# config as a json
config_fp = out_fp / f"{cfg.path.config_log_stem}.log"
with open(config_fp, "w") as f:
f.write(OmegaConf.to_yaml(cfg))

model_performance_fp = out_fp / f"{cfg.path.performance_log_stem}.log"
with open(model_performance_fp, "w") as f:
f.write("model_fp,tuning_auc,test_auc\n")
f.write(f"{output_fp},{model.evaluate()},{model.evaluate(split='held_out')}\n")

logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}")


def current_script_name() -> str:
"""Returns the name of the module that called this function."""

21 changes: 13 additions & 8 deletions tests/test_integration.py
@@ -2,6 +2,7 @@

root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True)

import glob
import json
import shutil
import subprocess
@@ -291,8 +292,8 @@ def test_integration(tmp_path):
"output_model_dir": str(output_model_dir.resolve()),
"model_launcher": model,
"path.model_file_stem": model,
"hydra.sweeper.n_trials": 2,
"delete_below_top_k": 1,
"hydra.sweeper.n_trials": 3,
"delete_below_top_k": 2,
"data_loading_params.keep_data_in_memory": True,
}
overrides = [f"tabularization.aggs={stdout_agg.strip()}"]
@@ -305,9 +306,11 @@
stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}")
assert "Performance of best model:" in stderr
if model == "xgboost":
assert len(list_subdir_files(str(output_model_dir.resolve()), "json")) == 1
assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.json"))) == 2
assert len(glob.glob(str(output_model_dir / "*/best_trial/*.json"))) == 1
else:
assert len(list_subdir_files(str(output_model_dir.resolve()), "pkl")) == 1
assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.pkl"))) == 2
assert len(glob.glob(str(output_model_dir / "*/best_trial/*.pkl"))) == 1
shutil.rmtree(output_model_dir)

for model in [
@@ -322,8 +325,8 @@
"output_model_dir": str(output_model_dir.resolve()),
"model_launcher": model,
"path.model_file_stem": model,
"hydra.sweeper.n_trials": 2,
"delete_below_top_k": 1,
"hydra.sweeper.n_trials": 3,
"delete_below_top_k": 2,
"data_loading_params.keep_data_in_memory": False,
}
overrides = [f"tabularization.aggs={stdout_agg.strip()}"]
@@ -336,7 +339,9 @@
stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}")
assert "Performance of best model:" in stderr
if model == "xgboost":
assert len(list_subdir_files(str(output_model_dir.resolve()), "json")) == 1
assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.json"))) == 2
assert len(glob.glob(str(output_model_dir / "*/best_trial/*.json"))) == 1
else:
assert len(list_subdir_files(str(output_model_dir.resolve()), "pkl")) == 1
assert len(glob.glob(str(output_model_dir / "*/sweep_results/**/*.pkl"))) == 2
assert len(glob.glob(str(output_model_dir / "*/best_trial/*.pkl"))) == 1
shutil.rmtree(output_model_dir)