Merge pull request #18 from mmcdermott/clean
Incorporating XGBoost fixes
mmcdermott authored Jun 13, 2024
2 parents 530ebff + 0d53128 commit d8e9de6
Showing 3 changed files with 42 additions and 27 deletions.
16 changes: 13 additions & 3 deletions src/MEDS_tabular_automl/configs/launch_xgboost.yaml
@@ -3,14 +3,15 @@ defaults:
   - tabularization: default
   - override hydra/sweeper: optuna
   - override hydra/sweeper/sampler: tpe
+  - override hydra/launcher: joblib
   - _self_

 task_name: task

 # Task cached data dir
 input_dir: ${MEDS_cohort_dir}/${task_name}/task_cache
 # Directory with task labels
-input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels
+input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels/final_cohort
 # Where to output the model and cached data
 output_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S}
 output_filepath: ${output_dir}/model_metadata.parquet
@@ -44,10 +45,19 @@ hydra:
     study_name: null #study_${now:%Y-%m-%d_%H-%M-%S}
     storage: null
     direction: minimize
-    n_trials: 10
+    n_trials: 250
     n_jobs: 25

     # Define search space for Optuna
     params:
       tabularization.window_sizes: choice([30d], [30d, 365d], [365d, full])
+      model_params.model.eta: tag(log, interval(0.001, 1))
+      model_params.model.lambda: tag(log, interval(0.001, 1))
+      model_params.model.alpha: tag(log, interval(0.001, 1))
+      model_params.model.subsample: interval(0.5, 1)
+      model_params.model.min_child_weight: interval(1e-2, 100)
       model_params.num_boost_round: range(100, 1000)
       model_params.early_stopping_rounds: range(1, 10)
+      model_params.model.max_depth: range(2, 16)
       tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))

 name: launch_xgboost
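
For intuition, here is a rough sketch of the Optuna equivalents of the search space above. The hydra-optuna-sweeper plugin performs this translation internally when the script is run with --multirun; `suggest_search_space` is a hypothetical name for illustration, not a function in this repo.

```python
# Sketch: rough Optuna equivalents of the Hydra sweeper search space above.
import optuna


def suggest_search_space(trial: optuna.Trial) -> dict:
    return {
        # choice(...) -> categorical (choices rendered here as strings)
        "window_sizes": trial.suggest_categorical(
            "window_sizes", ["[30d]", "[30d, 365d]", "[365d, full]"]
        ),
        # tag(log, interval(a, b)) -> log-uniform float on [a, b]
        "eta": trial.suggest_float("eta", 0.001, 1.0, log=True),
        "lambda": trial.suggest_float("lambda", 0.001, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 0.001, 1.0, log=True),
        # interval(a, b) -> uniform float on [a, b]
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-2, 100),
        # range(a, b) -> integer draw on [a, b]
        "max_depth": trial.suggest_int("max_depth", 2, 16),
        "num_boost_round": trial.suggest_int("num_boost_round", 100, 1000),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 1, 10),
        # tag(log, range(a, b)) -> log-spaced integer draw
        "min_code_inclusion_frequency": trial.suggest_int(
            "min_code_inclusion_frequency", 10, 1_000_000, log=True
        ),
    }
```

With `n_trials: 250` and `n_jobs: 25`, the TPE sampler explores this space over 250 trials, parallelized across up to 25 workers via the joblib launcher override added at the top of the file.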
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/configs/task_specific_caching.yaml
@@ -7,7 +7,7 @@ task_name: task
 # Tabularized Data
 input_dir: ${MEDS_cohort_dir}/tabularize
 # Where the labels are stored, with columns patient_id, timestamp, label
-input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels
+input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels/final_cohort
 # Where to output the task specific tabularized data
 output_dir: ${MEDS_cohort_dir}/${task_name}/task_cache
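
As an aside on the schema named in the comment above, here is a small sketch of what a label frame under `input_label_dir` is expected to look like; the use of polars and the sample values are illustrative assumptions, not repo code.

```python
# Sketch of the expected label schema (columns patient_id, timestamp, label),
# per the comment above. Polars and the literal values are assumptions.
import polars as pl

labels = pl.DataFrame(
    {
        "patient_id": [1, 1, 2],
        "timestamp": ["2020-01-01", "2020-06-01", "2021-03-15"],
        "label": [0, 1, 0],
    }
)
print(labels)
```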

51 changes: 28 additions & 23 deletions src/MEDS_tabular_automl/scripts/launch_xgboost.py
@@ -278,7 +278,8 @@ def collect_in_memory(self) -> tuple[sp.csc_matrix, np.ndarray]:
             X_, y_ = self._get_shard_by_index(i)
             X.append(X_)
             y.append(y_)
-
+        if len(X) == 0 or len(y) == 0:
+            raise ValueError("No data found in the shards or labels. Please check input files.")
         X = sp.vstack(X)
         y = np.concatenate(y, axis=0)
         return X, y
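
The new guard fails fast with a readable message; without it, an empty shard list only surfaces as an opaque error inside scipy. A standalone illustration of the failure mode (not repo code; scipy's exact error text varies by version):

```python
# Standalone illustration of the failure mode the guard above prevents.
import scipy.sparse as sp

shards: list[sp.csc_matrix] = []  # e.g., no shards matched the task labels

if len(shards) == 0:
    # The commit's check raises this clear error before scipy is reached:
    print("No data found in the shards or labels. Please check input files.")
else:
    X = sp.vstack(shards)  # on an empty list this raises an opaque error instead
```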
@@ -315,6 +316,7 @@ def _train(self):
             early_stopping_rounds=self.cfg.model_params.early_stopping_rounds,
             # nthreads=self.cfg.nthreads,
             evals=[(self.dtrain, "train"), (self.dtuning, "tuning")],
+            verbose_eval=0,
         )

     @TimeableMixin.TimeAs
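
For reference, a self-contained toy sketch of a training call shaped like the one above; only the keyword arguments mirror the diff, and the data and parameter values are made up.

```python
# Toy sketch: verbose_eval=0 silences XGBoost's per-round eval log for the
# "train" and "tuning" sets while early stopping still tracks them.
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = (X[:, 0] > 0).astype(int)
dtrain = xgb.DMatrix(X[:150], label=y[:150])
dtuning = xgb.DMatrix(X[150:], label=y[150:])

booster = xgb.train(
    {"objective": "binary:logistic", "eval_metric": "auc", "eta": 0.3},
    dtrain,
    num_boost_round=100,
    early_stopping_rounds=5,  # stops once "tuning" AUC stalls
    evals=[(dtrain, "train"), (dtuning, "tuning")],
    verbose_eval=0,  # 0/False: no per-round printout
)
print(booster.best_iteration)
```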
@@ -380,30 +382,33 @@ def main(cfg: DictConfig) -> float:
         - float: Evaluation result.
     """

-    print(OmegaConf.to_yaml(cfg))
+    # print(OmegaConf.to_yaml(cfg))
     if not cfg.loguru_init:
         hydra_loguru_init()

-    model = XGBoostModel(cfg)
-    model.train()
-    auc = model.evaluate()
-    logger.info(f"AUC: {auc}")
-
-    print(
-        "Time Profiling for window sizes ",
-        f"{cfg.tabularization.window_sizes} and min ",
-        f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
-    )
-    print("Train Time: \n", model._profile_durations())
-    # print("Train Iterator Time: \n", model.itrain._profile_durations())
-    # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
-    # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
-
-    # save model
-    save_dir = Path(cfg.output_dir)
-    save_dir.mkdir(parents=True, exist_ok=True)
-    model_time = datetime.now().strftime("%H%M%S%f")
-    model.model.save_model(save_dir / f"{auc:.4f}_model_{model_time}.json")
+    try:
+        model = XGBoostModel(cfg)
+        model.train()
+        auc = model.evaluate()
+        logger.info(f"AUC: {auc}")
+
+        # print(
+        #     "Time Profiling for window sizes ",
+        #     f"{cfg.tabularization.window_sizes} and min ",
+        #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
+        # )
+        # print("Train Time: \n", model._profile_durations())
+        # print("Train Iterator Time: \n", model.itrain._profile_durations())
+        # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
+        # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
+
+        # save model
+        save_dir = Path(cfg.output_dir)
+        save_dir.mkdir(parents=True, exist_ok=True)
+        model_time = datetime.now().strftime("%H%M%S%f")
+        model.model.save_model(save_dir / f"{auc:.4f}_model_{model_time}.json")
+    except Exception as e:
+        logger.error(f"Error occurred: {e}")
+        auc = 0.0
     return auc
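
The try/except changes the contract with the sweeper: a crashed trial now logs the error and reports `auc = 0.0` instead of killing the whole multirun. A minimal sketch of that entry-point contract; `evaluate_once` and the config path are hypothetical stand-ins, not this repo's code.

```python
# Minimal sketch: the Optuna sweeper optimizes the float returned by the
# @hydra.main-decorated function, so the except branch turns a crashed
# trial into a score of 0.0 rather than aborting the sweep.
import hydra
from omegaconf import DictConfig


def evaluate_once(cfg: DictConfig) -> float:
    return 0.5  # placeholder for "train model, return AUC"


@hydra.main(version_base=None, config_path="../configs", config_name="launch_xgboost")
def main(cfg: DictConfig) -> float:
    try:
        return evaluate_once(cfg)
    except Exception as e:
        print(f"Error occurred: {e}")
        return 0.0  # failed trial scores 0.0; the sweep keeps going


if __name__ == "__main__":
    main()
```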


