Commit b65754c

[wip] sharing for updates only
teyaberg committed Aug 20, 2024
1 parent f3c985a commit b65754c
Showing 11 changed files with 39 additions and 52 deletions.
3 changes: 1 addition & 2 deletions src/MEDS_tabular_automl/base_model.py
@@ -1,7 +1,6 @@
-from typing import Dict, Type
from abc import ABC, abstractmethod
from pathlib import Path
from omegaconf import DictConfig

from mixins import TimeableMixin


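For orientation, here is a minimal sketch of what the abstract interface plausibly looks like after this commit. The method set (train, evaluate, save_model) is inferred from how the launch scripts below call their models; it is an assumption, not something this diff shows.

from abc import ABC, abstractmethod
from pathlib import Path

from mixins import TimeableMixin
from omegaconf import DictConfig


class BaseModel(ABC, TimeableMixin):
    """Assumed contract for anything the model launcher can run."""

    @abstractmethod
    def __init__(self, cfg: DictConfig): ...

    @abstractmethod
    def train(self) -> None: ...

    @abstractmethod
    def evaluate(self) -> float:
        """Return AUC on the held-out split."""

    @abstractmethod
    def save_model(self, output_fp: Path) -> None: ...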
4 changes: 2 additions & 2 deletions src/MEDS_tabular_automl/configs/launch_model.yaml
@@ -1,7 +1,7 @@
defaults:
  - default
  - tabularization: default
-  - model: xgboost # This can be changed to sgd_classifier or any other model
+  - model: xgboost # This can be changed to sgd_classifier or any other model
  - override hydra/sweeper: optuna
  - override hydra/sweeper/sampler: tpe
  - override hydra/launcher: joblib
@@ -19,4 +19,4 @@ output_filepath: ${model_dir}/model_metadata.json

log_dir: ${model_dir}/.logs/

-name: launch_model
\ No newline at end of file
+name: launch_model
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml
@@ -16,4 +16,4 @@ hydra:
+model_params.model.l1_ratio: interval(0, 1)
+model_params.model.penalty: choice(['l1', 'l2', 'elasticnet'])
model_params.epochs: range(10, 100)
-model_params.early_stopping_rounds: range(1, 10)
\ No newline at end of file
+model_params.early_stopping_rounds: range(1, 10)
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/configs/models/xgboost.yaml
@@ -24,4 +24,4 @@ hydra:
model_params.num_boost_round: range(100, 1000)
model_params.early_stopping_rounds: range(1, 10)
+model_params.model.max_depth: range(2, 16)
-tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
\ No newline at end of file
+tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
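These sweeper entries use the Hydra Optuna sweeper's search-space grammar: a leading + adds a key that is absent from the base config, interval(a, b) is a continuous range, choice(...) a categorical, range(a, b) an integer range, and tag(log, ...) samples on a log scale. Conceptually the sweeper maps them onto Optuna suggestions roughly like this illustrative sketch (not the sweeper's actual code):

import optuna


def sample_params(trial: optuna.Trial) -> dict:
    return {
        # interval(0, 1) -> continuous uniform range
        "model_params.model.l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
        # choice(['l1', 'l2', 'elasticnet']) -> categorical
        "model_params.model.penalty": trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"]),
        # range(10, 100) -> integer range
        "model_params.epochs": trial.suggest_int("epochs", 10, 100),
        # tag(log, range(10, 1000000)) -> integer range sampled on a log scale
        "tabularization.min_code_inclusion_frequency": trial.suggest_int("min_code_freq", 10, 1_000_000, log=True),
    }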
9 changes: 0 additions & 9 deletions src/MEDS_tabular_automl/dense_iterator.py
@@ -1,19 +1,12 @@
from pathlib import Path

import hydra
import numpy as np
import scipy.sparse as sp
from loguru import logger
from mixins import TimeableMixin
from omegaconf import DictConfig
from sklearn.metrics import roc_auc_score

from .tabular_dataset import TabularDataset
from .base_model import BaseModel


class DenseIterator(TabularDataset, TimeableMixin):

    def __init__(self, cfg: DictConfig, split: str):
        """Initializes the DenseIterator with the provided configuration and data split.
@@ -50,5 +43,3 @@ def densify(self) -> np.ndarray:
        data = sp.vstack(data)
        labels = np.concatenate(labels, axis=0)
        return data, labels, selected_features
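Downstream, densify yields one stacked design matrix per split (note the annotation says np.ndarray while the body returns a three-tuple). A hypothetical usage sketch; cfg contents and shard layout are assumptions, not shown in this diff:

# Hypothetical: cfg must carry whatever TabularDataset expects from the Hydra configs.
iterator = DenseIterator(cfg, split="train")
data, labels, selected_features = iterator.densify()
# data: scipy.sparse matrix, all shards stacked row-wise via sp.vstack
# labels: 1-D numpy array aligned with the rows of data
# selected_features: the feature set retained after filtering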


9 changes: 5 additions & 4 deletions src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -1,5 +1,4 @@
from importlib.resources import files
from pathlib import Path

import hydra
import pandas as pd
@@ -21,13 +20,12 @@ def main(cfg: DictConfig) -> float:
    Args:
        cfg: The configuration dictionary specifying model and training parameters.
    """

    # print(OmegaConf.to_yaml(cfg))
    if not cfg.loguru_init:
        hydra_loguru_init()

    # check that autogluon is installed
    try:
        import autogluon as ag
@@ -54,7 +52,10 @@ def main(cfg: DictConfig) -> float:

    # launch AutoGluon
    predictor = ag.TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df)

    # TODO: fix logging, etc.
    auc = predictor.evaluate(held_out_df)
    logger.info(f"AUC: {auc}")


if __name__ == "__main__":
    main()
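A self-contained sketch of the AutoGluon flow used above; note that TabularPredictor canonically lives in autogluon.tabular, and the toy frames here merely stand in for the parquet-backed train/tuning/held-out splits:

import pandas as pd
from autogluon.tabular import TabularPredictor

# Toy stand-ins; "target" plays the role of cfg.task_name.
train_df = pd.DataFrame({"feat": [0.1, 0.9, 0.2, 0.8] * 10, "target": [0, 1, 0, 1] * 10})
tuning_df = train_df.sample(frac=0.25, random_state=0)
held_out_df = train_df.sample(frac=0.25, random_state=1)

predictor = TabularPredictor(label="target").fit(train_data=train_df, tuning_data=tuning_df)
metrics = predictor.evaluate(held_out_df)  # dict mapping metric names to scores
print(metrics)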
9 changes: 2 additions & 7 deletions src/MEDS_tabular_automl/scripts/launch_model.py
@@ -4,17 +4,12 @@
import hydra
from loguru import logger
from omegaconf import DictConfig
-from typing import Dict, Type

from MEDS_tabular_automl.base_model import BaseModel
from MEDS_tabular_automl.sklearn_model import SklearnModel
from MEDS_tabular_automl.xgboost_model import XGBoostModel


-MODEL_CLASSES: Dict[str, Type[BaseModel]] = {
-    "xgboost": XGBoostModel,
-    "sklearn": SklearnModel
-}
+MODEL_CLASSES: dict[str, type[BaseModel]] = {"xgboost": XGBoostModel, "sklearn": SklearnModel}

from ..utils import hydra_loguru_init

@@ -42,7 +37,7 @@ def main(cfg: DictConfig) -> float:
    ModelClass = MODEL_CLASSES.get(model_type)
    if ModelClass is None:
        raise ValueError(f"Model type {model_type} not supported.")

    model = ModelClass(cfg)
    model.train()
    auc = model.evaluate()
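The collapsed registry keeps launch_model extensible: selecting a backend is a dict lookup, and the ValueError above is the only failure path for unknown names. Adding a backend is then a single entry; a hypothetical sketch (MyModel and its registration are illustrative, not part of this commit):

from MEDS_tabular_automl.base_model import BaseModel
from MEDS_tabular_automl.scripts.launch_model import MODEL_CLASSES


class MyModel(BaseModel):
    """Hypothetical third backend; must satisfy whatever BaseModel requires."""

    def __init__(self, cfg):
        self.cfg = cfg

    def train(self) -> None:
        pass

    def evaluate(self) -> float:
        return 0.5


MODEL_CLASSES["my_model"] = MyModel  # selectable via model=my_model, assuming a matching config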
48 changes: 24 additions & 24 deletions src/MEDS_tabular_automl/scripts/launch_xgboost.py
@@ -27,30 +27,30 @@ def main(cfg: DictConfig) -> float:
    # print(OmegaConf.to_yaml(cfg))
    if not cfg.loguru_init:
        hydra_loguru_init()
-    # try:
-    model = XGBoostModel(cfg)
-    model.train()
-    auc = model.evaluate()
-    logger.info(f"AUC: {auc}")
-
-    # print(
-    #     "Time Profiling for window sizes ",
-    #     f"{cfg.tabularization.window_sizes} and min ",
-    #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
-    # )
-    # print("Train Time: \n", model._profile_durations())
-    # print("Train Iterator Time: \n", model.itrain._profile_durations())
-    # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
-    # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
-
-    # save model
-    output_fp = Path(cfg.output_filepath)
-    output_fp.parent.mkdir(parents=True, exist_ok=True)
-
-    model.save_model(output_fp)
-    # except Exception as e:
-    #     logger.error(f"Error occurred: {e}")
-    #     auc = 0.0
+    try:
+        model = XGBoostModel(cfg)
+        model.train()
+        auc = model.evaluate()
+        logger.info(f"AUC: {auc}")
+
+        # print(
+        #     "Time Profiling for window sizes ",
+        #     f"{cfg.tabularization.window_sizes} and min ",
+        #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
+        # )
+        # print("Train Time: \n", model._profile_durations())
+        # print("Train Iterator Time: \n", model.itrain._profile_durations())
+        # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
+        # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())
+
+        # save model
+        output_fp = Path(cfg.output_filepath)
+        output_fp.parent.mkdir(parents=True, exist_ok=True)
+
+        model.save_model(output_fp)
+    except Exception as e:
+        logger.error(f"Error occurred: {e}")
+        auc = 0.0
    return auc


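Re-enabling the try/except is what keeps a sweep alive: under the Optuna sweeper, a raising trial would typically abort the study, while returning 0.0 just records a worst-case objective for that trial. The pattern in isolation (all names illustrative):

import logging

logger = logging.getLogger(__name__)


def run_trial(cfg) -> float:
    raise RuntimeError("stand-in for a failing training run")


def objective(cfg) -> float:
    try:
        return run_trial(cfg)
    except Exception as e:
        logger.error(f"Error occurred: {e}")
        return 0.0  # sentinel: the sweeper scores the trial instead of crashing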
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/sklearn_model.py
@@ -8,8 +8,8 @@
from omegaconf import DictConfig
from sklearn.metrics import roc_auc_score

-from .tabular_dataset import TabularDataset
from .base_model import BaseModel
+from .tabular_dataset import TabularDataset


class SklearnIterator(TabularDataset, TimeableMixin):
1 change: 1 addition & 0 deletions src/MEDS_tabular_automl/utils.py
@@ -108,6 +108,7 @@ def load_tqdm(use_tqdm: bool):

        return tqdm
    else:
+
        def noop(x, **kwargs):
            return x

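Pieced together, the helper presumably reads as below; this is a reconstruction from the visible fragment, not the full file:

def load_tqdm(use_tqdm: bool):
    """Return tqdm when requested, else a transparent pass-through wrapper."""
    if use_tqdm:
        from tqdm import tqdm

        return tqdm
    else:

        def noop(x, **kwargs):
            return x

        return noop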
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/xgboost_model.py
@@ -8,8 +8,8 @@
from omegaconf import DictConfig, OmegaConf
from sklearn.metrics import roc_auc_score

-from .tabular_dataset import TabularDataset
from .base_model import BaseModel
+from .tabular_dataset import TabularDataset


class XGBIterator(xgb.DataIter, TabularDataset, TimeableMixin):
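XGBIterator layers xgb.DataIter over the package's TabularDataset so training can stream shards instead of materializing one matrix. A self-contained sketch of the xgb.DataIter contract it builds on (the in-memory shards stand in for the dataset's lazily loaded ones):

import numpy as np
import xgboost as xgb


class ShardIterator(xgb.DataIter):
    """Feeds (X, y) shards to XGBoost one at a time."""

    def __init__(self, shards):
        self._shards = shards
        self._it = 0
        super().__init__()

    def next(self, input_data) -> int:
        if self._it == len(self._shards):
            return 0  # no more shards; end of one pass over the data
        X, y = self._shards[self._it]
        input_data(data=X, label=y)  # hand the current shard to XGBoost
        self._it += 1
        return 1

    def reset(self) -> None:
        self._it = 0  # rewind for the next pass


rng = np.random.default_rng(0)
shards = [(rng.normal(size=(32, 4)), rng.integers(0, 2, 32)) for _ in range(3)]
dtrain = xgb.QuantileDMatrix(ShardIterator(shards))
booster = xgb.train({"objective": "binary:logistic", "tree_method": "hist"}, dtrain, num_boost_round=5)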
