Commit: autogluon
teyaberg committed Aug 20, 2024
1 parent a8d8417 commit d07f6a2
Showing 5 changed files with 140 additions and 25 deletions.
28 changes: 28 additions & 0 deletions src/MEDS_tabular_automl/configs/launch_autogluon.yaml
@@ -0,0 +1,28 @@
defaults:
  - default
  - tabularization: default
  - override hydra/sweeper: optuna
  - override hydra/sweeper/sampler: tpe
  - override hydra/launcher: joblib
  - _self_

task_name: task

# Task cached data dir
input_dir: ${output_cohort_dir}/${task_name}/task_cache
# Directory with task labels
input_label_dir: ${output_cohort_dir}/${task_name}/labels/
# Where to output the model and cached data
model_dir: ${output_cohort_dir}/autogluon/autogluon_${now:%Y-%m-%d_%H-%M-%S}
output_filepath: ${model_dir}

# Model parameters
model_params:
  iterator:
    keep_data_in_memory: True
    binarize_task: True

log_dir: ${model_dir}/.logs/
log_filepath: ${log_dir}/log.txt

name: launch_autogluon
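
For context: the _set_imputer and _set_scaler hooks added to tabular_dataset.py in this commit look for optional imputer and scaler entries under model_params.iterator and instantiate them via Hydra. Those keys are not part of this commit's config; a minimal sketch of what they could look like, assuming sparse-friendly scikit-learn transformers (hypothetical keys, not repo code):

model_params:
  iterator:
    keep_data_in_memory: True
    binarize_task: True
    # hypothetical keys, consumed via hydra.utils.instantiate
    imputer:
      _target_: sklearn.impute.SimpleImputer
      strategy: constant
      fill_value: 0
    scaler:
      _target_: sklearn.preprocessing.MaxAbsScaler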
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/configs/launch_xgboost.yaml
@@ -53,6 +53,6 @@ hydra:
      model_params.num_boost_round: range(100, 1000)
      model_params.early_stopping_rounds: range(1, 10)
      +model_params.model.max_depth: range(2, 16)
-     tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
+     tabularization.min_code_inclusion_count: tag(log, range(10, 1000000))

name: launch_xgboost
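
The tag(log, range(...)) syntax above is Hydra Optuna sweeper grammar: it asks for an integer sampled on a log scale. As a rough, self-contained sketch of the equivalent search space in plain Optuna (illustrative only; the real sweep is driven by Hydra, and the objective below is a placeholder):

import optuna


def objective(trial: optuna.Trial) -> float:
    # mirrors the config: log-scaled integer in [10, 1_000_000], plain integer in [2, 16]
    min_count = trial.suggest_int("min_code_inclusion_count", 10, 1_000_000, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 16)
    # placeholder objective; the real pipeline trains XGBoost and returns its tuning AUC
    return -abs(max_depth - 8) - abs(min_count - 100) / 100.0


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print(study.best_params)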
10 changes: 1 addition & 9 deletions src/MEDS_tabular_automl/dense_iterator.py
@@ -25,21 +25,13 @@ def __init__(self, cfg: DictConfig, split: str):
    def densify(self) -> np.ndarray:
        """Builds the data as a dense matrix based on column subselection."""

-       # get the column indices to include
-       cols = self.get_feature_indices()
-
-       # map those to the feature names in the data
-       feature_names = self.get_all_column_names()
-       selected_features = [feature_names[col] for col in cols]
-
        # get the dense matrix by iterating through the data shards
        data = []
        labels = []
        for shard_idx in range(len(self._data_shards)):
            shard_data, shard_labels = self.get_data_shards(shard_idx)
-           shard_data = shard_data[:, cols]
            data.append(shard_data)
            labels.append(shard_labels)
        data = sp.vstack(data)
        labels = np.concatenate(labels, axis=0)
-       return data, labels, selected_features
+       return data, labels
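
With the column subselection removed, densify simply stacks each shard: sp.vstack concatenates the sparse shard matrices row-wise without densifying them, and np.concatenate joins the label vectors. A toy sketch of that stacking step (synthetic data, not repo code):

import numpy as np
import scipy.sparse as sp

shard_a = sp.csr_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))
shard_b = sp.csr_matrix(np.array([[3.0, 0.0]]))
labels_a, labels_b = np.array([0, 1]), np.array([1])

data = sp.vstack([shard_a, shard_b])  # result stays sparse, shape (3, 2)
labels = np.concatenate([labels_a, labels_b], axis=0)
print(data.shape, labels)  # (3, 2) [0 1 1]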
40 changes: 27 additions & 13 deletions src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -9,7 +9,7 @@

from ..utils import hydra_loguru_init

-config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml")
+config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_autogluon.yaml")
if not config_yaml.is_file():
    raise FileNotFoundError("Core configuration not successfully installed!")

@@ -28,7 +28,7 @@ def main(cfg: DictConfig) -> float:

    # check that autogluon is installed
    try:
-       import autogluon as ag
+       import autogluon.tabular as ag
    except ImportError:
        logger.error("AutoGluon is not installed. Please install AutoGluon.")

@@ -38,23 +38,37 @@ def main(cfg: DictConfig) -> float:
    iheld_out = DenseIterator(cfg, "held_out")

    # collect data for AutoGluon
-   train_data, train_labels, cols = itrain.densify()
-   tuning_data, tuning_labels, _ = ituning.densify()
-   held_out_data, held_out_labels, _ = iheld_out.densify()
+   train_data, train_labels = itrain.densify()
+   tuning_data, tuning_labels = ituning.densify()
+   held_out_data, held_out_labels = iheld_out.densify()

    # construct dfs for AutoGluon
-   train_df = pd.DataFrame(train_data.todense(), columns=cols)
+   train_df = pd.DataFrame(train_data.todense())  # , columns=cols)
    train_df[cfg.task_name] = train_labels
-   tuning_df = pd.DataFrame(tuning_data.todense(), columns=cols)
+   tuning_df = pd.DataFrame(
+       tuning_data.todense(),
+   )  # columns=cols)
    tuning_df[cfg.task_name] = tuning_labels
-   held_out_df = pd.DataFrame(held_out_data.todense(), columns=cols)
+   held_out_df = pd.DataFrame(held_out_data.todense())  # , columns=cols)
    held_out_df[cfg.task_name] = held_out_labels

-   # launch AutoGluon
-   predictor = ag.TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df)
-   # TODO: fix logging, etc.
-   auc = predictor.evaluate(held_out_df)
-   logger.info(f"AUC: {auc}")
+   train_dataset = ag.TabularDataset(train_df)
+   tuning_dataset = ag.TabularDataset(tuning_df)
+   held_out_dataset = ag.TabularDataset(held_out_df)
+
+   # train model with AutoGluon
+   predictor = ag.TabularPredictor(
+       label=cfg.task_name, log_to_file=True, log_file_path=cfg.log_filepath, path=cfg.output_filepath
+   ).fit(train_data=train_dataset, tuning_data=tuning_dataset)
+
+   # predict
+   predictions = predictor.predict(held_out_dataset.drop(columns=[cfg.task_name]))
+   print("Predictions:", predictions)
+   # evaluate
+   score = predictor.evaluate(held_out_dataset)
+   print("Test score:", score)
+
+   # TODO(model) add tests for autogluon pipeline


if __name__ == "__main__":
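
A minimal, self-contained sketch of the AutoGluon calls used above, on synthetic data (assumes autogluon.tabular is installed; note that predictor.evaluate returns a dict of metric scores rather than a single AUC value):

import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

# toy binary-classification frame standing in for the densified shards
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
df["task"] = (df["f0"] > 0).astype(int)

train_df, test_df = df.iloc[:150], df.iloc[150:]
predictor = TabularPredictor(label="task").fit(train_data=TabularDataset(train_df))
predictions = predictor.predict(test_df.drop(columns=["task"]))
score = predictor.evaluate(TabularDataset(test_df))  # e.g. {"accuracy": ..., "roc_auc": ...}
print(predictions.head(), score)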
85 changes: 83 additions & 2 deletions src/MEDS_tabular_automl/tabular_dataset.py
@@ -1,6 +1,7 @@
from collections.abc import Mapping
from pathlib import Path

+import hydra
import numpy as np
import polars as pl
import scipy.sparse as sp
@@ -57,6 +58,9 @@ def __init__(self, cfg: DictConfig, split: str = "train"):

        self.codes_set, self.code_masks, self.num_features = self._get_code_set()

+       self._set_scaler()
+       self._set_imputer()

    @TimeableMixin.TimeAs
    def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]:
        """Creates boolean masks for filtering features.
@@ -214,6 +218,54 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr
            corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0]
        return corrs

    def _set_imputer(self):
        """Sets the imputer for the data."""
        if hasattr(self.cfg.model_params.iterator, "imputer"):
            imputer = hydra.utils.instantiate(self.cfg.model_params.iterator.imputer)
            if hasattr(imputer, "partial_fit"):
                for i in range(len(self._data_shards)):
                    X, _ = self.get_data_shards(i)
                    imputer.partial_fit(X)
            elif hasattr(imputer, "fit"):
                imputer.fit(self.get_data_shards(0)[0])
            else:
                raise ValueError("Imputer must have a fit or partial_fit method.")
            self.imputer = imputer
        else:
            self.imputer = None

    def _set_scaler(self):
        """Sets the scaler for the data."""
        if hasattr(self.cfg.model_params.iterator, "scaler"):
            scaler = hydra.utils.instantiate(self.cfg.model_params.iterator.scaler)
            if hasattr(scaler, "partial_fit"):
                for i in range(len(self._data_shards)):
                    X, _ = self.get_data_shards(i)
                    scaler.partial_fit(X)
            elif hasattr(scaler, "fit"):
                X = self.get_data_shards(0)[0]
                scaler.fit(X)
            else:
                raise ValueError("Scaler must have a fit or partial_fit method.")
            self.scaler = scaler
        else:
            self.scaler = None

    def _impute_and_scale_data(self, data: sp.csc_matrix) -> sp.csc_matrix:
        """Imputes and scales the data using the fitted imputer and scaler.

        Args:
            data: The data to impute and scale.

        Returns:
            The imputed and scaled data.
        """
        if self.imputer is not None:
            data = self.imputer.transform(data)
        if self.scaler is not None:
            return self.scaler.transform(data)
        return data

    @TimeableMixin.TimeAs
    def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix:
        """Loads a specific data shard into memory as a sparse matrix.
@@ -320,7 +372,7 @@ def get_data_shards(self, idx: int | list[int]) -> tuple[sp.csc_matrix, np.ndarr
            idx = [idx]
        for i in idx:
            X_, y_ = self._get_shard_by_index(i)
-           X.append(X_)
+           X.append(self._impute_and_scale_data(X_))
            y.append(y_)
        if len(X) == 0 or len(y) == 0:
            raise ValueError("No data found in the shards or labels. Please check input files.")
@@ -406,5 +458,34 @@ def extract_name(test_file):
                all_feats.append(f"{feat_name}/{agg}/{window}")

        # filter by only those in the list of indices
-       all_feats = [all_feats[i] for i in indices]
+       if indices is not None:
+           all_feats = [all_feats[i] for i in indices]
        return all_feats

    def get_columns_and_indices(self) -> tuple[list[str], list[int]]:
        """Retrieves the names and indices of the columns in the data.

        Returns:
            A tuple containing the names of the columns and their indices.
        """
        raise NotImplementedError("This method is not implemented yet.")
        # NOTE: the draft implementation below is unreachable until the raise above is removed
        files = get_model_files(self.cfg, self.split, self._data_shards[0])

        def extract_name(test_file):
            return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem))

        agg_wind_combos = [extract_name(test_file) for test_file in files]

        feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp)
        all_feats = []
        all_indices = []
        for agg_wind in agg_wind_combos:
            window, feat, agg = agg_wind.split("/")
            feature_ids = get_feature_indices(feat + "/" + agg, feature_columns)
            feature_names = [feature_columns[i] for i in feature_ids]
            for feat_name in feature_names:
                all_feats.append(f"{feat_name}/{agg}/{window}")
            # collect the matching column indices alongside the names
            all_indices.extend(feature_ids)

        return all_feats, all_indices
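
The _set_scaler/_set_imputer logic above prefers partial_fit so a transformer can be fit one shard at a time without materializing the full matrix, falling back to fit on the first shard otherwise. A small sketch of that pattern with a sparse-friendly scikit-learn scaler (toy shards, not repo code):

import scipy.sparse as sp
from sklearn.preprocessing import MaxAbsScaler

shards = [sp.random(100, 20, density=0.05, format="csc", random_state=i) for i in range(3)]

scaler = MaxAbsScaler()
if hasattr(scaler, "partial_fit"):
    for X in shards:  # one shard in memory at a time
        scaler.partial_fit(X)
else:
    scaler.fit(shards[0])  # fallback: fit on the first shard only

scaled = [scaler.transform(X) for X in shards]
print(scaled[0].max())  # entries now scaled into [-1, 1]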
