Merge pull request #18 from mmcdermott/clean
Incorporating XGBoost fixes
mmcdermott authored Jun 13, 2024
2 parents 530ebff + 0d53128 commit d8e9de6
Showing 3 changed files with 42 additions and 27 deletions.
16 changes: 13 additions & 3 deletions src/MEDS_tabular_automl/configs/launch_xgboost.yaml
@@ -3,14 +3,15 @@ defaults:
   - tabularization: default
   - override hydra/sweeper: optuna
   - override hydra/sweeper/sampler: tpe
+  - override hydra/launcher: joblib
   - _self_

 task_name: task

 # Task cached data dir
 input_dir: ${MEDS_cohort_dir}/${task_name}/task_cache
 # Directory with task labels
-input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels
+input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels/final_cohort
 # Where to output the model and cached data
 output_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S}
 output_filepath: ${output_dir}/model_metadata.parquet
@@ -44,10 +45,19 @@ hydra:
     study_name: null #study_${now:%Y-%m-%d_%H-%M-%S}
     storage: null
     direction: minimize
-    n_trials: 10
+    n_trials: 250
     n_jobs: 25

     # Define search space for Optuna
     params:
       tabularization.window_sizes: choice([30d], [30d, 365d], [365d, full])
+      model_params.model.eta: tag(log, interval(0.001, 1))
+      model_params.model.lambda: tag(log, interval(0.001, 1))
+      model_params.model.alpha: tag(log, interval(0.001, 1))
+      model_params.model.subsample: interval(0.5, 1)
+      model_params.model.min_child_weight: interval(1e-2, 100)
       model_params.num_boost_round: range(100, 1000)
       model_params.early_stopping_rounds: range(1, 10)
+      model_params.model.max_depth: range(2, 16)
       tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))

 name: launch_xgboost
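
For intuition, here is a rough sketch of the Optuna equivalents of the search space above. The hydra-optuna-sweeper plugin performs this translation internally when the script is run with --multirun; `suggest_search_space` is a hypothetical name for illustration, not a function in this repo.

```python
# Sketch: rough Optuna equivalents of the Hydra sweeper search space above.
import optuna


def suggest_search_space(trial: optuna.Trial) -> dict:
    return {
        # choice(...) -> categorical (choices rendered here as strings)
        "window_sizes": trial.suggest_categorical(
            "window_sizes", ["[30d]", "[30d, 365d]", "[365d, full]"]
        ),
        # tag(log, interval(a, b)) -> log-uniform float on [a, b]
        "eta": trial.suggest_float("eta", 0.001, 1.0, log=True),
        "lambda": trial.suggest_float("lambda", 0.001, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 0.001, 1.0, log=True),
        # interval(a, b) -> uniform float on [a, b]
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-2, 100),
        # range(a, b) -> integer draw on [a, b]
        "max_depth": trial.suggest_int("max_depth", 2, 16),
        "num_boost_round": trial.suggest_int("num_boost_round", 100, 1000),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 1, 10),
        # tag(log, range(a, b)) -> log-spaced integer draw
        "min_code_inclusion_frequency": trial.suggest_int(
            "min_code_inclusion_frequency", 10, 1_000_000, log=True
        ),
    }
```

With `n_trials: 250` and `n_jobs: 25`, the TPE sampler explores this space over 250 trials, parallelized across up to 25 workers via the joblib launcher override added at the top of the file.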
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/configs/task_specific_caching.yaml
@@ -7,7 +7,7 @@ task_name: task
 # Tabularized Data
 input_dir: ${MEDS_cohort_dir}/tabularize
 # Where the labels are stored, with columns patient_id, timestamp, label
-input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels
+input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels/final_cohort
 # Where to output the task specific tabularized data
 output_dir: ${MEDS_cohort_dir}/${task_name}/task_cache
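
As an aside on the schema named in the comment above, here is a small sketch of what a label frame under `input_label_dir` is expected to look like; the use of polars and the sample values are illustrative assumptions, not repo code.

```python
# Sketch of the expected label schema (columns patient_id, timestamp, label),
# per the comment above. Polars and the literal values are assumptions.
import polars as pl

labels = pl.DataFrame(
    {
        "patient_id": [1, 1, 2],
        "timestamp": ["2020-01-01", "2020-06-01", "2021-03-15"],
        "label": [0, 1, 0],
    }
)
print(labels)
```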

51 changes: 28 additions & 23 deletions src/MEDS_tabular_automl/scripts/launch_xgboost.py
@@ -278,7 +278,8 @@ def collect_in_memory(self) -> tuple[sp.csc_matrix, np.ndarray]:
             X_, y_ = self._get_shard_by_index(i)
             X.append(X_)
             y.append(y_)
-
+        if len(X) == 0 or len(y) == 0:
+            raise ValueError("No data found in the shards or labels. Please check input files.")
         X = sp.vstack(X)
         y = np.concatenate(y, axis=0)
         return X, y
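
The new guard fails fast with a readable message; without it, an empty shard list only surfaces as an opaque error inside scipy. A standalone illustration of the failure mode (not repo code; scipy's exact error text varies by version):

```python
# Standalone illustration of the failure mode the guard above prevents.
import scipy.sparse as sp

shards: list[sp.csc_matrix] = []  # e.g., no shards matched the task labels

if len(shards) == 0:
    # The commit's check raises this clear error before scipy is reached:
    print("No data found in the shards or labels. Please check input files.")
else:
    X = sp.vstack(shards)  # on an empty list this raises an opaque error instead
```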
@@ -315,6 +316,7 @@ def _train(self):
             early_stopping_rounds=self.cfg.model_params.early_stopping_rounds,
             # nthreads=self.cfg.nthreads,
             evals=[(self.dtrain, "train"), (self.dtuning, "tuning")],
+            verbose_eval=0,
         )

     @TimeableMixin.TimeAs
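
For reference, a self-contained toy sketch of a training call shaped like the one above; only the keyword arguments mirror the diff, and the data and parameter values are made up.

```python
# Toy sketch: verbose_eval=0 silences XGBoost's per-round eval log for the
# "train" and "tuning" sets while early stopping still tracks them.
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = (X[:, 0] > 0).astype(int)
dtrain = xgb.DMatrix(X[:150], label=y[:150])
dtuning = xgb.DMatrix(X[150:], label=y[150:])

booster = xgb.train(
    {"objective": "binary:logistic", "eval_metric": "auc", "eta": 0.3},
    dtrain,
    num_boost_round=100,
    early_stopping_rounds=5,  # stops once "tuning" AUC stalls
    evals=[(dtrain, "train"), (dtuning, "tuning")],
    verbose_eval=0,  # 0/False: no per-round printout
)
print(booster.best_iteration)
```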
@@ -380,30 +382,33 @@ def main(cfg: DictConfig) -> float:
         - float: Evaluation result.
     """

-    print(OmegaConf.to_yaml(cfg))
+    # print(OmegaConf.to_yaml(cfg))
     if not cfg.loguru_init:
         hydra_loguru_init()

-    model = XGBoostModel(cfg)
-    model.train()
-    auc = model.evaluate()
-    logger.info(f"AUC: {auc}")
-
-    print(
-        "Time Profiling for window sizes ",
-        f"{cfg.tabularization.window_sizes} and min ",
-        f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
-    )
-    print("Train Time: \n", model._profile_durations())
-    # print("Train Iterator Time: \n", model.itrain._profile_durations())
-    # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
-    # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
-
-    # save model
-    save_dir = Path(cfg.output_dir)
-    save_dir.mkdir(parents=True, exist_ok=True)
-    model_time = datetime.now().strftime("%H%M%S%f")
-    model.model.save_model(save_dir / f"{auc:.4f}_model_{model_time}.json")
+    try:
+        model = XGBoostModel(cfg)
+        model.train()
+        auc = model.evaluate()
+        logger.info(f"AUC: {auc}")
+
+        # print(
+        #     "Time Profiling for window sizes ",
+        #     f"{cfg.tabularization.window_sizes} and min ",
+        #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
+        # )
+        # print("Train Time: \n", model._profile_durations())
+        # print("Train Iterator Time: \n", model.itrain._profile_durations())
+        # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
+        # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
+
+        # save model
+        save_dir = Path(cfg.output_dir)
+        save_dir.mkdir(parents=True, exist_ok=True)
+        model_time = datetime.now().strftime("%H%M%S%f")
+        model.model.save_model(save_dir / f"{auc:.4f}_model_{model_time}.json")
+    except Exception as e:
+        logger.error(f"Error occurred: {e}")
+        auc = 0.0
     return auc
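
The try/except changes the contract with the sweeper: a crashed trial now logs the error and reports `auc = 0.0` instead of killing the whole multirun. A minimal sketch of that entry-point contract; `evaluate_once` and the config path are hypothetical stand-ins, not this repo's code.

```python
# Minimal sketch: the Optuna sweeper optimizes the float returned by the
# @hydra.main-decorated function, so the except branch turns a crashed
# trial into a score of 0.0 rather than aborting the sweep.
import hydra
from omegaconf import DictConfig


def evaluate_once(cfg: DictConfig) -> float:
    return 0.5  # placeholder for "train model, return AUC"


@hydra.main(version_base=None, config_path="../configs", config_name="launch_xgboost")
def main(cfg: DictConfig) -> float:
    try:
        return evaluate_once(cfg)
    except Exception as e:
        print(f"Error occurred: {e}")
        return 0.0  # failed trial scores 0.0; the sweep keeps going


if __name__ == "__main__":
    main()
```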


