Skip to content

Commit

Permalink
update xgboost sweep
Browse files (browse the repository at this point in the history)
  • Loading branch information
teyaberg committed Jun 9, 2024
1 parent 125e3e7 commit 61e39df
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 95 deletions.
64 changes: 19 additions & 45 deletions src/MEDS_tabular_automl/configs/launch_xgboost.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
defaults:
- default
- tabularization: default
- override hydra/sweeper: optuna
- override hydra/sweeper/sampler: tpe
- _self_

task_name: task
Expand Down Expand Up @@ -28,52 +30,24 @@ model_params:
keep_data_in_memory: True
binarize_task: True

# Define search space for Optuna
optuna:
study_name: xgboost_sweep_${now:%Y-%m-%d_%H-%M-%S}
storage: null
load_if_exists: False
direction: minimize
sampler: null
pruner: null
hydra:
verbose: False
sweep:
dir: ${output_dir}/.logs/
run:
dir: ${output_dir}/.logs/

n_trials: 10
n_jobs: 1
show_progress_bar: False
# Optuna Sweeper
sweeper:
sampler:
seed: 1
study_name: null #study_${now:%Y-%m-%d_%H-%M-%S}
storage: null
direction: minimize
n_trials: 10

params:
suggest_categorical:
window_sizes: ${generate_permutations:${tabularization.window_sizes}}
aggs: ${generate_permutations:${tabularization.aggs}}
suggest_float:
eta:
low: .001
high: 1
log: True
lambda:
low: .001
high: 1
log: True
alpha:
low: .001
high: 1
log: True
subsample:
low: 0.5
high: 1
min_child_weight:
low: 1e-2
high: 100
suggest_int:
num_boost_round:
low: 10
high: 1000
max_depth:
low: 2
high: 16
min_code_inclusion_frequency:
low: 10
high: 1_000_000
log: True
# Define search space for Optuna
params:
tabularization.window_sizes: choice([30d], [30d, 365d], [365d, full])

name: launch_xgboost
36 changes: 14 additions & 22 deletions src/MEDS_tabular_automl/scripts/launch_xgboost.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections.abc import Callable, Mapping
from datetime import datetime
from importlib.resources import files
from pathlib import Path

Expand All @@ -12,7 +13,7 @@
from omegaconf import DictConfig, OmegaConf
from sklearn.metrics import roc_auc_score

from MEDS_tabular_automl.describe_codes import get_feature_columns, get_feature_freqs
from MEDS_tabular_automl.describe_codes import get_feature_columns
from MEDS_tabular_automl.file_name import get_model_files, list_subdir_files
from MEDS_tabular_automl.utils import get_feature_indices, hydra_loguru_init

Expand Down Expand Up @@ -188,18 +189,8 @@ def _get_dynamic_shard_by_index(self, idx: int) -> sp.csc_matrix:

dynamic_cscs = [self._load_dynamic_shard_from_file(file, idx) for file in files]

fn_name = "_get_dynamic_shard_by_index"
hstack_key = f"{fn_name}/hstack"
self._register_start(key=hstack_key)

combined_csc = sp.hstack(dynamic_cscs, format="csc") # TODO: check this
# self._register_end(key=hstack_key)
# # Filter Rows
# valid_indices = self.valid_event_ids[shard_name]
# filter_key = f"{fn_name}/filter"
# self._register_start(key=filter_key)
# out = combined_csc[valid_indices, :]
# self._register_end(key=filter_key)
combined_csc = sp.hstack(dynamic_cscs, format="csc")

return combined_csc

@TimeableMixin.TimeAs
Expand Down Expand Up @@ -388,30 +379,31 @@ def main(cfg: DictConfig) -> float:
Returns:
- float: Evaluation result.
"""

print(OmegaConf.to_yaml(cfg))
if not cfg.loguru_init:
hydra_loguru_init()

model = XGBoostModel(cfg)
model.train()
auc = model.evaluate()
logger.info(f"AUC: {auc}")

print(
"Time Profiling for window sizes ",
f"{cfg.tabularization.window_sizes} and min ",
"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
)
print("Train Time: \n", model._profile_durations())
print("Train Iterator Time: \n", model.itrain._profile_durations())
print("Tuning Iterator Time: \n", model.ituning._profile_durations())
print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
# print("Train Iterator Time: \n", model.itrain._profile_durations())
# print("Tuning Iterator Time: \n", model.ituning._profile_durations())
# print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())

# save model
save_dir = Path(cfg.output_dir)
save_dir.mkdir(parents=True, exist_ok=True)

logger.info(f"Saving the model to directory: {save_dir}")
model.model.save_model(save_dir / "model.json")
auc = model.evaluate()
logger.info(f"AUC: {auc}")
model_time = datetime.now().strftime("%H%M%S%f")
model.model.save_model(save_dir / f"{auc:.4f}_model_{model_time}.json")
return auc


Expand Down
12 changes: 7 additions & 5 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,14 +275,16 @@ def test_integration():
)
output_files = list(Path(cfg.output_dir).parent.glob("**/*.json"))
assert len(output_files) == 1
assert output_files[0].stem == "model"
# assert output_files[0].stem == '0.6667_model'

stderr, stdout = run_command(
"meds-tab-xgboost-sweep",
[],
"meds-tab-xgboost",
[
"--multirun",
],
xgboost_config_kwargs,
"xgboost-sweep",
)
output_files = list(Path(cfg.output_dir).parent.glob("**/*.json"))
assert len(output_files) == 2
assert output_files[0].stem == "model"
assert len(output_files) == 11
# assert output_files[0].stem == "model"
45 changes: 22 additions & 23 deletions tests/test_tabularize.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
cache_task,
describe_codes,
launch_xgboost,
sweep_xgboost,
tabularize_static,
tabularize_time_series,
)
Expand Down Expand Up @@ -395,25 +394,25 @@ def test_tabularize():
assert output_files[0] == Path(cfg.output_dir) / "model.json"
os.remove(Path(cfg.output_dir) / "model.json")

xgboost_config_kwargs = {
"MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()),
"do_overwrite": False,
"seed": 1,
"hydra.verbose": True,
"tqdm": False,
"loguru_init": True,
"tabularization.min_code_inclusion_frequency": 1,
"tabularization.aggs": "[static/present,static/first,code/count,value/sum]",
"tabularization.window_sizes": "[30d,365d,full]",
}

with initialize(
version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
): # path to config.yaml
overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()]
cfg = compose(config_name="launch_xgboost", overrides=overrides) # config.yaml

sweep_xgboost.main(cfg)
output_files = list(Path(cfg.output_dir).glob("**/*.json"))
assert len(output_files) == 1
assert output_files[0] == Path(cfg.output_dir) / "model.json"
# xgboost_config_kwargs = {
# "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()),
# "do_overwrite": False,
# "seed": 1,
# "hydra.verbose": True,
# "tqdm": False,
# "loguru_init": True,
# "tabularization.min_code_inclusion_frequency": 1,
# "tabularization.aggs": "[static/present,static/first,code/count,value/sum]",
# "tabularization.window_sizes": "[30d,365d,full]",
# }

# with initialize(
# version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
# ): # path to config.yaml
# overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()]
# cfg = compose(config_name="launch_xgboost", overrides=overrides) # config.yaml

# launch_xgboost.main(cfg)
# output_files = list(Path(cfg.output_dir).glob("**/*.json"))
# assert len(output_files) == 1
# assert output_files[0] == Path(cfg.output_dir) / "model.json"

0 comments on commit 61e39df

Please sign in to comment.