Commit b65754c

[wip] sharing for updates only
teyaberg committed Aug 20, 2024
1 parent f3c985a commit b65754c
Showing 11 changed files with 39 additions and 52 deletions.
3 changes: 1 addition & 2 deletions src/MEDS_tabular_automl/base_model.py
@@ -1,7 +1,6 @@
-from typing import Dict, Type
from abc import ABC, abstractmethod
from pathlib import Path
from omegaconf import DictConfig

from mixins import TimeableMixin


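For orientation, here is a minimal sketch of what the abstract interface plausibly looks like after this commit. The method set (train, evaluate, save_model) is inferred from how the launch scripts below call their models; it is an assumption, not something this diff shows.

from abc import ABC, abstractmethod
from pathlib import Path

from mixins import TimeableMixin
from omegaconf import DictConfig


class BaseModel(ABC, TimeableMixin):
    """Assumed contract for anything the model launcher can run."""

    @abstractmethod
    def __init__(self, cfg: DictConfig): ...

    @abstractmethod
    def train(self) -> None: ...

    @abstractmethod
    def evaluate(self) -> float:
        """Return AUC on the held-out split."""

    @abstractmethod
    def save_model(self, output_fp: Path) -> None: ...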
4 changes: 2 additions & 2 deletions src/MEDS_tabular_automl/configs/launch_model.yaml
@@ -1,7 +1,7 @@
defaults:
  - default
  - tabularization: default
-  - model: xgboost # This can be changed to sgd_classifier or any other model
+  - model: xgboost # This can be changed to sgd_classifier or any other model
  - override hydra/sweeper: optuna
  - override hydra/sweeper/sampler: tpe
  - override hydra/launcher: joblib
@@ -19,4 +19,4 @@ output_filepath: ${model_dir}/model_metadata.json

log_dir: ${model_dir}/.logs/

-name: launch_model
\ No newline at end of file
+name: launch_model
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml
@@ -16,4 +16,4 @@ hydra:
+model_params.model.l1_ratio: interval(0, 1)
+model_params.model.penalty: choice(['l1', 'l2', 'elasticnet'])
model_params.epochs: range(10, 100)
-model_params.early_stopping_rounds: range(1, 10)
\ No newline at end of file
+model_params.early_stopping_rounds: range(1, 10)
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/configs/models/xgboost.yaml
@@ -24,4 +24,4 @@ hydra:
model_params.num_boost_round: range(100, 1000)
model_params.early_stopping_rounds: range(1, 10)
+model_params.model.max_depth: range(2, 16)
-tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
\ No newline at end of file
+tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
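These sweeper entries use the Hydra Optuna sweeper's search-space grammar: a leading + adds a key that is absent from the base config, interval(a, b) is a continuous range, choice(...) a categorical, range(a, b) an integer range, and tag(log, ...) samples on a log scale. Conceptually the sweeper maps them onto Optuna suggestions roughly like this illustrative sketch (not the sweeper's actual code):

import optuna


def sample_params(trial: optuna.Trial) -> dict:
    return {
        # interval(0, 1) -> continuous uniform range
        "model_params.model.l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
        # choice(['l1', 'l2', 'elasticnet']) -> categorical
        "model_params.model.penalty": trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"]),
        # range(10, 100) -> integer range
        "model_params.epochs": trial.suggest_int("epochs", 10, 100),
        # tag(log, range(10, 1000000)) -> integer range sampled on a log scale
        "tabularization.min_code_inclusion_frequency": trial.suggest_int("min_code_freq", 10, 1_000_000, log=True),
    }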
9 changes: 0 additions & 9 deletions src/MEDS_tabular_automl/dense_iterator.py
@@ -1,19 +1,12 @@
from pathlib import Path

import hydra
import numpy as np
import scipy.sparse as sp
from loguru import logger
from mixins import TimeableMixin
from omegaconf import DictConfig
from sklearn.metrics import roc_auc_score

from .tabular_dataset import TabularDataset
from .base_model import BaseModel


class DenseIterator(TabularDataset, TimeableMixin):

    def __init__(self, cfg: DictConfig, split: str):
        """Initializes the DenseIterator with the provided configuration and data split.
@@ -50,5 +43,3 @@ def densify(self) -> np.ndarray:
        data = sp.vstack(data)
        labels = np.concatenate(labels, axis=0)
        return data, labels, selected_features
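Downstream, densify yields one stacked design matrix per split (note the annotation says np.ndarray while the body returns a three-tuple). A hypothetical usage sketch; cfg contents and shard layout are assumptions, not shown in this diff:

# Hypothetical: cfg must carry whatever TabularDataset expects from the Hydra configs.
iterator = DenseIterator(cfg, split="train")
data, labels, selected_features = iterator.densify()
# data: scipy.sparse matrix, all shards stacked row-wise via sp.vstack
# labels: 1-D numpy array aligned with the rows of data
# selected_features: the feature set retained after filtering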


9 changes: 5 additions & 4 deletions src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -1,5 +1,4 @@
from importlib.resources import files
from pathlib import Path

import hydra
import pandas as pd
@@ -21,13 +20,12 @@ def main(cfg: DictConfig) -> float:
    Args:
        cfg: The configuration dictionary specifying model and training parameters.
    """

    # print(OmegaConf.to_yaml(cfg))
    if not cfg.loguru_init:
        hydra_loguru_init()

    # check that autogluon is installed
    try:
        import autogluon as ag
@@ -54,7 +52,10 @@ def main(cfg: DictConfig) -> float:

    # launch AutoGluon
    predictor = ag.TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df)

    # TODO: fix logging, etc.
    auc = predictor.evaluate(held_out_df)
    logger.info(f"AUC: {auc}")


if __name__ == "__main__":
    main()
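A self-contained sketch of the AutoGluon flow used above; note that TabularPredictor canonically lives in autogluon.tabular, and the toy frames here merely stand in for the parquet-backed train/tuning/held-out splits:

import pandas as pd
from autogluon.tabular import TabularPredictor

# Toy stand-ins; "target" plays the role of cfg.task_name.
train_df = pd.DataFrame({"feat": [0.1, 0.9, 0.2, 0.8] * 10, "target": [0, 1, 0, 1] * 10})
tuning_df = train_df.sample(frac=0.25, random_state=0)
held_out_df = train_df.sample(frac=0.25, random_state=1)

predictor = TabularPredictor(label="target").fit(train_data=train_df, tuning_data=tuning_df)
metrics = predictor.evaluate(held_out_df)  # dict mapping metric names to scores
print(metrics)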
9 changes: 2 additions & 7 deletions src/MEDS_tabular_automl/scripts/launch_model.py
@@ -4,17 +4,12 @@
import hydra
from loguru import logger
from omegaconf import DictConfig
-from typing import Dict, Type

from MEDS_tabular_automl.base_model import BaseModel
from MEDS_tabular_automl.sklearn_model import SklearnModel
from MEDS_tabular_automl.xgboost_model import XGBoostModel


-MODEL_CLASSES: Dict[str, Type[BaseModel]] = {
-    "xgboost": XGBoostModel,
-    "sklearn": SklearnModel
-}
+MODEL_CLASSES: dict[str, type[BaseModel]] = {"xgboost": XGBoostModel, "sklearn": SklearnModel}

from ..utils import hydra_loguru_init

@@ -42,7 +37,7 @@ def main(cfg: DictConfig) -> float:
    ModelClass = MODEL_CLASSES.get(model_type)
    if ModelClass is None:
        raise ValueError(f"Model type {model_type} not supported.")

    model = ModelClass(cfg)
    model.train()
    auc = model.evaluate()
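The collapsed registry keeps launch_model extensible: selecting a backend is a dict lookup, and the ValueError above is the only failure path for unknown names. Adding a backend is then a single entry; a hypothetical sketch (MyModel and its registration are illustrative, not part of this commit):

from MEDS_tabular_automl.base_model import BaseModel
from MEDS_tabular_automl.scripts.launch_model import MODEL_CLASSES


class MyModel(BaseModel):
    """Hypothetical third backend; must satisfy whatever BaseModel requires."""

    def __init__(self, cfg):
        self.cfg = cfg

    def train(self) -> None:
        pass

    def evaluate(self) -> float:
        return 0.5


MODEL_CLASSES["my_model"] = MyModel  # selectable via model=my_model, assuming a matching config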
48 changes: 24 additions & 24 deletions src/MEDS_tabular_automl/scripts/launch_xgboost.py
@@ -27,30 +27,30 @@ def main(cfg: DictConfig) -> float:
    # print(OmegaConf.to_yaml(cfg))
    if not cfg.loguru_init:
        hydra_loguru_init()
-    # try:
-    model = XGBoostModel(cfg)
-    model.train()
-    auc = model.evaluate()
-    logger.info(f"AUC: {auc}")
-
-    # print(
-    #     "Time Profiling for window sizes ",
-    #     f"{cfg.tabularization.window_sizes} and min ",
-    #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
-    # )
-    # print("Train Time: \n", model._profile_durations())
-    # print("Train Iterator Time: \n", model.itrain._profile_durations())
-    # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
-    # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
-
-    # save model
-    output_fp = Path(cfg.output_filepath)
-    output_fp.parent.mkdir(parents=True, exist_ok=True)
-
-    model.save_model(output_fp)
-    # except Exception as e:
-    #     logger.error(f"Error occurred: {e}")
-    #     auc = 0.0
+    try:
+        model = XGBoostModel(cfg)
+        model.train()
+        auc = model.evaluate()
+        logger.info(f"AUC: {auc}")
+
+        # print(
+        #     "Time Profiling for window sizes ",
+        #     f"{cfg.tabularization.window_sizes} and min ",
+        #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
+        # )
+        # print("Train Time: \n", model._profile_durations())
+        # print("Train Iterator Time: \n", model.itrain._profile_durations())
+        # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
+        # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
+
+        # save model
+        output_fp = Path(cfg.output_filepath)
+        output_fp.parent.mkdir(parents=True, exist_ok=True)
+
+        model.save_model(output_fp)
+    except Exception as e:
+        logger.error(f"Error occurred: {e}")
+        auc = 0.0
    return auc


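Re-enabling the try/except is what keeps a sweep alive: under the Optuna sweeper, a raising trial would typically abort the study, while returning 0.0 just records a worst-case objective for that trial. The pattern in isolation (all names illustrative):

import logging

logger = logging.getLogger(__name__)


def run_trial(cfg) -> float:
    raise RuntimeError("stand-in for a failing training run")


def objective(cfg) -> float:
    try:
        return run_trial(cfg)
    except Exception as e:
        logger.error(f"Error occurred: {e}")
        return 0.0  # sentinel: the sweeper scores the trial instead of crashing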
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/sklearn_model.py
@@ -8,8 +8,8 @@
from omegaconf import DictConfig
from sklearn.metrics import roc_auc_score

-from .tabular_dataset import TabularDataset
from .base_model import BaseModel
+from .tabular_dataset import TabularDataset


class SklearnIterator(TabularDataset, TimeableMixin):
1 change: 1 addition & 0 deletions src/MEDS_tabular_automl/utils.py
@@ -108,6 +108,7 @@ def load_tqdm(use_tqdm: bool):

        return tqdm
    else:
+
        def noop(x, **kwargs):
            return x

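Pieced together, the helper presumably reads as below; this is a reconstruction from the visible fragment, not the full file:

def load_tqdm(use_tqdm: bool):
    """Return tqdm when requested, else a transparent pass-through wrapper."""
    if use_tqdm:
        from tqdm import tqdm

        return tqdm
    else:

        def noop(x, **kwargs):
            return x

        return noop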
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/xgboost_model.py
@@ -8,8 +8,8 @@
from omegaconf import DictConfig, OmegaConf
from sklearn.metrics import roc_auc_score

-from .tabular_dataset import TabularDataset
from .base_model import BaseModel
+from .tabular_dataset import TabularDataset


class XGBIterator(xgb.DataIter, TabularDataset, TimeableMixin):
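XGBIterator layers xgb.DataIter over the package's TabularDataset so training can stream shards instead of materializing one matrix. A self-contained sketch of the xgb.DataIter contract it builds on (the in-memory shards stand in for the dataset's lazily loaded ones):

import numpy as np
import xgboost as xgb


class ShardIterator(xgb.DataIter):
    """Feeds (X, y) shards to XGBoost one at a time."""

    def __init__(self, shards):
        self._shards = shards
        self._it = 0
        super().__init__()

    def next(self, input_data) -> int:
        if self._it == len(self._shards):
            return 0  # no more shards; end of one pass over the data
        X, y = self._shards[self._it]
        input_data(data=X, label=y)  # hand the current shard to XGBoost
        self._it += 1
        return 1

    def reset(self) -> None:
        self._it = 0  # rewind for the next pass


rng = np.random.default_rng(0)
shards = [(rng.normal(size=(32, 4)), rng.integers(0, 2, 32)) for _ in range(3)]
dtrain = xgb.QuantileDMatrix(ShardIterator(shards))
booster = xgb.train({"objective": "binary:logistic", "tree_method": "hist"}, dtrain, num_boost_round=5)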
