Commit: autogluon
teyaberg committed Aug 20, 2024
1 parent a8d8417 commit d07f6a2
Showing 5 changed files with 140 additions and 25 deletions.
28 changes: 28 additions & 0 deletions src/MEDS_tabular_automl/configs/launch_autogluon.yaml
@@ -0,0 +1,28 @@
defaults:
  - default
  - tabularization: default
  - override hydra/sweeper: optuna
  - override hydra/sweeper/sampler: tpe
  - override hydra/launcher: joblib
  - _self_

task_name: task

# Task cached data dir
input_dir: ${output_cohort_dir}/${task_name}/task_cache
# Directory with task labels
input_label_dir: ${output_cohort_dir}/${task_name}/labels/
# Where to output the model and cached data
model_dir: ${output_cohort_dir}/autogluon/autogluon_${now:%Y-%m-%d_%H-%M-%S}
output_filepath: ${model_dir}

# Model parameters
model_params:
  iterator:
    keep_data_in_memory: True
    binarize_task: True

log_dir: ${model_dir}/.logs/
log_filepath: ${log_dir}/log.txt

name: launch_autogluon
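
For context: the _set_imputer and _set_scaler hooks added to tabular_dataset.py in this commit look for optional imputer and scaler entries under model_params.iterator and instantiate them via Hydra. Those keys are not part of this commit's config; a minimal sketch of what they could look like, assuming sparse-friendly scikit-learn transformers (hypothetical keys, not repo code):

model_params:
  iterator:
    keep_data_in_memory: True
    binarize_task: True
    # hypothetical keys, consumed via hydra.utils.instantiate
    imputer:
      _target_: sklearn.impute.SimpleImputer
      strategy: constant
      fill_value: 0
    scaler:
      _target_: sklearn.preprocessing.MaxAbsScaler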
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/configs/launch_xgboost.yaml
@@ -53,6 +53,6 @@ hydra:
      model_params.num_boost_round: range(100, 1000)
      model_params.early_stopping_rounds: range(1, 10)
      +model_params.model.max_depth: range(2, 16)
-     tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
+     tabularization.min_code_inclusion_count: tag(log, range(10, 1000000))

name: launch_xgboost
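
The tag(log, range(...)) syntax above is Hydra Optuna sweeper grammar: it asks for an integer sampled on a log scale. As a rough, self-contained sketch of the equivalent search space in plain Optuna (illustrative only; the real sweep is driven by Hydra, and the objective below is a placeholder):

import optuna


def objective(trial: optuna.Trial) -> float:
    # mirrors the config: log-scaled integer in [10, 1_000_000], plain integer in [2, 16]
    min_count = trial.suggest_int("min_code_inclusion_count", 10, 1_000_000, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 16)
    # placeholder objective; the real pipeline trains XGBoost and returns its tuning AUC
    return -abs(max_depth - 8) - abs(min_count - 100) / 100.0


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print(study.best_params)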
10 changes: 1 addition & 9 deletions src/MEDS_tabular_automl/dense_iterator.py
@@ -25,21 +25,13 @@ def __init__(self, cfg: DictConfig, split: str):
    def densify(self) -> np.ndarray:
        """Builds the data as a dense matrix based on column subselection."""

-       # get the column indices to include
-       cols = self.get_feature_indices()
-
-       # map those to the feature names in the data
-       feature_names = self.get_all_column_names()
-       selected_features = [feature_names[col] for col in cols]
-
        # get the dense matrix by iterating through the data shards
        data = []
        labels = []
        for shard_idx in range(len(self._data_shards)):
            shard_data, shard_labels = self.get_data_shards(shard_idx)
-           shard_data = shard_data[:, cols]
            data.append(shard_data)
            labels.append(shard_labels)
        data = sp.vstack(data)
        labels = np.concatenate(labels, axis=0)
-       return data, labels, selected_features
+       return data, labels
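
With the column subselection removed, densify simply stacks each shard: sp.vstack concatenates the sparse shard matrices row-wise without densifying them, and np.concatenate joins the label vectors. A toy sketch of that stacking step (synthetic data, not repo code):

import numpy as np
import scipy.sparse as sp

shard_a = sp.csr_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))
shard_b = sp.csr_matrix(np.array([[3.0, 0.0]]))
labels_a, labels_b = np.array([0, 1]), np.array([1])

data = sp.vstack([shard_a, shard_b])  # result stays sparse, shape (3, 2)
labels = np.concatenate([labels_a, labels_b], axis=0)
print(data.shape, labels)  # (3, 2) [0 1 1]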
40 changes: 27 additions & 13 deletions src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -9,7 +9,7 @@

from ..utils import hydra_loguru_init

-config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml")
+config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_autogluon.yaml")
if not config_yaml.is_file():
    raise FileNotFoundError("Core configuration not successfully installed!")

@@ -28,7 +28,7 @@ def main(cfg: DictConfig) -> float:

    # check that autogluon is installed
    try:
-       import autogluon as ag
+       import autogluon.tabular as ag
    except ImportError:
        logger.error("AutoGluon is not installed. Please install AutoGluon.")

@@ -38,23 +38,37 @@ def main(cfg: DictConfig) -> float:
    iheld_out = DenseIterator(cfg, "held_out")

    # collect data for AutoGluon
-   train_data, train_labels, cols = itrain.densify()
-   tuning_data, tuning_labels, _ = ituning.densify()
-   held_out_data, held_out_labels, _ = iheld_out.densify()
+   train_data, train_labels = itrain.densify()
+   tuning_data, tuning_labels = ituning.densify()
+   held_out_data, held_out_labels = iheld_out.densify()

    # construct dfs for AutoGluon
-   train_df = pd.DataFrame(train_data.todense(), columns=cols)
+   train_df = pd.DataFrame(train_data.todense())  # , columns=cols)
    train_df[cfg.task_name] = train_labels
-   tuning_df = pd.DataFrame(tuning_data.todense(), columns=cols)
+   tuning_df = pd.DataFrame(
+       tuning_data.todense(),
+   )  # columns=cols)
    tuning_df[cfg.task_name] = tuning_labels
-   held_out_df = pd.DataFrame(held_out_data.todense(), columns=cols)
+   held_out_df = pd.DataFrame(held_out_data.todense())  # , columns=cols)
    held_out_df[cfg.task_name] = held_out_labels

-   # launch AutoGluon
-   predictor = ag.TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df)
-   # TODO: fix logging, etc.
-   auc = predictor.evaluate(held_out_df)
-   logger.info(f"AUC: {auc}")
+   train_dataset = ag.TabularDataset(train_df)
+   tuning_dataset = ag.TabularDataset(tuning_df)
+   held_out_dataset = ag.TabularDataset(held_out_df)
+
+   # train model with AutoGluon
+   predictor = ag.TabularPredictor(
+       label=cfg.task_name, log_to_file=True, log_file_path=cfg.log_filepath, path=cfg.output_filepath
+   ).fit(train_data=train_dataset, tuning_data=tuning_dataset)
+
+   # predict
+   predictions = predictor.predict(held_out_dataset.drop(columns=[cfg.task_name]))
+   print("Predictions:", predictions)
+   # evaluate
+   score = predictor.evaluate(held_out_dataset)
+   print("Test score:", score)
+
+   # TODO(model) add tests for autogluon pipeline


if __name__ == "__main__":
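
A minimal, self-contained sketch of the AutoGluon calls used above, on synthetic data (assumes autogluon.tabular is installed; note that predictor.evaluate returns a dict of metric scores rather than a single AUC value):

import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

# toy binary-classification frame standing in for the densified shards
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
df["task"] = (df["f0"] > 0).astype(int)

train_df, test_df = df.iloc[:150], df.iloc[150:]
predictor = TabularPredictor(label="task").fit(train_data=TabularDataset(train_df))
predictions = predictor.predict(test_df.drop(columns=["task"]))
score = predictor.evaluate(TabularDataset(test_df))  # e.g. {"accuracy": ..., "roc_auc": ...}
print(predictions.head(), score)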
85 changes: 83 additions & 2 deletions src/MEDS_tabular_automl/tabular_dataset.py
@@ -1,6 +1,7 @@
from collections.abc import Mapping
from pathlib import Path

+import hydra
import numpy as np
import polars as pl
import scipy.sparse as sp
@@ -57,6 +58,9 @@ def __init__(self, cfg: DictConfig, split: str = "train"):

        self.codes_set, self.code_masks, self.num_features = self._get_code_set()

+       self._set_scaler()
+       self._set_imputer()

    @TimeableMixin.TimeAs
    def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]:
        """Creates boolean masks for filtering features.
@@ -214,6 +218,54 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr
            corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0]
        return corrs

    def _set_imputer(self):
        """Sets the imputer for the data."""
        if hasattr(self.cfg.model_params.iterator, "imputer"):
            imputer = hydra.utils.instantiate(self.cfg.model_params.iterator.imputer)
            if hasattr(imputer, "partial_fit"):
                for i in range(len(self._data_shards)):
                    X, _ = self.get_data_shards(i)
                    imputer.partial_fit(X)
            elif hasattr(imputer, "fit"):
                imputer.fit(self.get_data_shards(0)[0])
            else:
                raise ValueError("Imputer must have a fit or partial_fit method.")
            self.imputer = imputer
        else:
            self.imputer = None

    def _set_scaler(self):
        """Sets the scaler for the data."""
        if hasattr(self.cfg.model_params.iterator, "scaler"):
            scaler = hydra.utils.instantiate(self.cfg.model_params.iterator.scaler)
            if hasattr(scaler, "partial_fit"):
                for i in range(len(self._data_shards)):
                    X, _ = self.get_data_shards(i)
                    scaler.partial_fit(X)
            elif hasattr(scaler, "fit"):
                X = self.get_data_shards(0)[0]
                scaler.fit(X)
            else:
                raise ValueError("Scaler must have a fit or partial_fit method.")
            self.scaler = scaler
        else:
            self.scaler = None

    def _impute_and_scale_data(self, data: sp.csc_matrix) -> sp.csc_matrix:
        """Imputes and scales the data using the fitted imputer and scaler.

        Args:
            data: The data to impute and scale.

        Returns:
            The imputed and scaled data.
        """
        if self.imputer is not None:
            data = self.imputer.transform(data)
        if self.scaler is not None:
            return self.scaler.transform(data)
        return data

    @TimeableMixin.TimeAs
    def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix:
        """Loads a specific data shard into memory as a sparse matrix.
@@ -320,7 +372,7 @@ def get_data_shards(self, idx: int | list[int]) -> tuple[sp.csc_matrix, np.ndarr
            idx = [idx]
        for i in idx:
            X_, y_ = self._get_shard_by_index(i)
-           X.append(X_)
+           X.append(self._impute_and_scale_data(X_))
            y.append(y_)
        if len(X) == 0 or len(y) == 0:
            raise ValueError("No data found in the shards or labels. Please check input files.")
@@ -406,5 +458,34 @@ def extract_name(test_file):
                all_feats.append(f"{feat_name}/{agg}/{window}")

        # filter by only those in the list of indices
-       all_feats = [all_feats[i] for i in indices]
+       if indices is not None:
+           all_feats = [all_feats[i] for i in indices]
        return all_feats

    def get_columns_and_indices(self) -> tuple[list[str], list[int]]:
        """Retrieves the names and indices of the columns in the data.

        Returns:
            A tuple containing the names of the columns and their indices.
        """
        raise NotImplementedError("This method is not implemented yet.")
        # NOTE: the draft implementation below is unreachable until the raise above is removed
        files = get_model_files(self.cfg, self.split, self._data_shards[0])

        def extract_name(test_file):
            return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem))

        agg_wind_combos = [extract_name(test_file) for test_file in files]

        feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp)
        all_feats = []
        all_indices = []
        for agg_wind in agg_wind_combos:
            window, feat, agg = agg_wind.split("/")
            feature_ids = get_feature_indices(feat + "/" + agg, feature_columns)
            feature_names = [feature_columns[i] for i in feature_ids]
            for feat_name in feature_names:
                all_feats.append(f"{feat_name}/{agg}/{window}")
            # collect the matching column indices alongside the names
            all_indices.extend(feature_ids)

        return all_feats, all_indices
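
The _set_scaler/_set_imputer logic above prefers partial_fit so a transformer can be fit one shard at a time without materializing the full matrix, falling back to fit on the first shard otherwise. A small sketch of that pattern with a sparse-friendly scikit-learn scaler (toy shards, not repo code):

import scipy.sparse as sp
from sklearn.preprocessing import MaxAbsScaler

shards = [sp.random(100, 20, density=0.05, format="csc", random_state=i) for i in range(3)]

scaler = MaxAbsScaler()
if hasattr(scaler, "partial_fit"):
    for X in shards:  # one shard in memory at a time
        scaler.partial_fit(X)
else:
    scaler.fit(shards[0])  # fallback: fit on the first shard only

scaled = [scaler.transform(X) for X in shards]
print(scaled[0].max())  # entries now scaled into [-1, 1]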
