Commit

Merge pull request #91 from mmcdermott/improve_test_coverage
Re-worked the tabular dataset config a bit.
mmcdermott authored Sep 9, 2024
2 parents 0db7bd6 + aed27f1 commit c981534
Showing 12 changed files with 35 additions and 134 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/code-quality-main.yaml
@@ -13,10 +13,12 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"

       - name: Run pre-commits
         uses: pre-commit/[email protected]
6 changes: 4 additions & 2 deletions .github/workflows/code-quality-pr.yaml
@@ -16,10 +16,12 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"

       - name: Find modified files
         id: file_changes
2 changes: 1 addition & 1 deletion .github/workflows/publish-to-pypi.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.x"
+          python-version: "3.11"
       - name: Install pypa/build
         run: >-
           python3 -m
4 changes: 2 additions & 2 deletions .github/workflows/tests.yaml
@@ -19,10 +19,10 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
4 changes: 1 addition & 3 deletions .pre-commit-config.yaml
@@ -1,7 +1,5 @@
 default_language_version:
-  python: python3.12
-
-exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
+  python: python3.11

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
37 changes: 0 additions & 37 deletions src/MEDS_tabular_automl/dense_iterator.py

This file was deleted.

15 changes: 1 addition & 14 deletions src/MEDS_tabular_automl/evaluation_callback.py
@@ -3,13 +3,10 @@
 import polars as pl
 from hydra.experimental.callback import Callback
 from loguru import logger
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import DictConfig


 class EvaluationCallback(Callback):
-    def __init__(self, **kwargs):
-        self.kwargs = kwargs
-
     def on_multirun_end(self, config: DictConfig, **kwargs):
         """Find best model based on log files and logger.info its performance and hyperparameters."""
         log_fp = Path(config.model_logging.model_log_dir)
@@ -27,7 +24,6 @@ def on_multirun_end(self, config: DictConfig, **kwargs):

         logger.info(f"The best model can be found at {best_model}")
         self.log_performance(performance[0, :])
-        # self.log_hyperparams(log_fp / best_model / f"{config.model_logging.config_log_stem}.log")
         if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0:
             self.delete_below_top_k_models(
                 performance, config.model_saving.delete_below_top_k, config.model_saving.model_dir
@@ -43,15 +39,6 @@ def log_performance(self, best_model_performance):
             f"\nPerformance of best model:\nTuning AUC: {tuning_auc}\nTest AUC: {test_auc}",
         )

-    def log_hyperparams(self, best_params_fp):
-        """logger.info hyperparameters of the best model with nice formatting."""
-        # check if this file exists
-        if not best_params_fp.is_file():
-            raise FileNotFoundError(f"Best hyperparameters file not found at {best_params_fp}")
-        best_params = OmegaConf.load(best_params_fp)
-        # print using OmegaConf.to_yaml
-        logger.info(f"\nHyperparameters of the best model:\n{OmegaConf.to_yaml(best_params)}")
-
     def delete_below_top_k_models(self, performance, k, model_dir):
         """Save only top k models from the model directory and delete all other files."""
         top_k_models = performance.head(k)["model_fp"].values
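
For context, the surviving delete_below_top_k_models hook is what prunes everything below the k best models. A minimal sketch of that kind of pruning logic, assuming a polars DataFrame with "model_fp" and "tuning_auc" columns (the column names and helper name are illustrative, not taken verbatim from the repo):

import shutil
from pathlib import Path

import polars as pl


def prune_to_top_k(performance: pl.DataFrame, k: int, model_dir: Path) -> None:
    # Keep the k models with the best tuning AUC; delete every other artifact.
    keep = set(performance.sort("tuning_auc", descending=True).head(k)["model_fp"].to_list())
    for fp in Path(model_dir).iterdir():
        if str(fp) not in keep:
            fp.unlink() if fp.is_file() else shutil.rmtree(fp)
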
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -12,7 +12,7 @@
 except ImportError:
     ag = None

-from MEDS_tabular_automl.dense_iterator import DenseIterator
+from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator

 from ..utils import hydra_loguru_init, stage_init
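
The deleted dense_iterator.py is replaced by the import alias above, so existing call sites keep working unchanged. A minimal sketch of the pattern (the constructor call is hypothetical usage, assuming a Hydra DictConfig that matches the repo's tabularization schema):

from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator

# Callers that instantiated the old dedicated class are untouched:
# iterator = DenseIterator(cfg, split="held_out")
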
41 changes: 1 addition & 40 deletions src/MEDS_tabular_automl/sklearn_model.py
@@ -8,46 +8,7 @@
 from sklearn.metrics import roc_auc_score

 from .base_model import BaseModel
-from .tabular_dataset import TabularDataset
-
-
-class SklearnIterator(TabularDataset):
-    """SklearnIterator class for loading and processing data shards for use in SciKit-Learn models.
-
-    This class provides functionality for iterating through data shards, loading
-    feature data and labels, and processing them based on the provided configuration.
-
-    Args:
-        cfg: A configuration dictionary containing parameters for
-            data processing, feature selection, and other settings.
-        split: The data split to use, which can be one of "train", "tuning",
-            or "held_out". This determines which subset of the data is loaded and processed.
-
-    Attributes:
-        cfg: Configuration dictionary containing parameters for
-            data processing, feature selection, and other settings.
-        file_name_resolver: Object for resolving file names and paths based on the configuration.
-        split: The data split being used for loading and processing data shards.
-        _data_shards: List of data shard names.
-        valid_event_ids: Dictionary mapping shard number to a list of valid event IDs.
-        labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs.
-        codes_set: Set of codes to include in the data.
-        code_masks: Dictionary of code masks for filtering features based on aggregation.
-        num_features: Total number of features in the data.
-    """
-
-    def __init__(self, cfg: DictConfig, split: str):
-        """Initializes the SklearnIterator with the provided configuration and data split.
-
-        Args:
-            cfg: The configuration dictionary.
-            split: The data split to use.
-        """
-        super().__init__(cfg=cfg, split=split)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if len(self.labels) == 0:
-            raise ValueError("No labels found.")
+from .tabular_dataset import TabularDataset as SklearnIterator


 class SklearnMatrix:
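
The subclass deleted above added nothing beyond loading and validating labels, and that logic moves into TabularDataset.__init__ in this same commit (see tabular_dataset.py below), so a bare alias suffices. A self-contained sketch of the equivalence, with illustrative names (Base stands in for TabularDataset):

class Base:
    def __init__(self, labels: list[int]):
        # Validation the subclass previously repeated in its own __init__:
        if len(labels) == 0:
            raise ValueError("No labels found.")
        self.labels = labels


# Before: class SklearnIterator(Base) existed solely to run that check.
# After: the check lives in Base, so the subclass collapses to an alias.
SklearnIterator = Base
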
45 changes: 18 additions & 27 deletions src/MEDS_tabular_automl/tabular_dataset.py
@@ -69,6 +69,11 @@ def __init__(self, cfg: DictConfig, split: str = "train"):
         self._set_scaler()
         self._set_imputer()

+        self.valid_event_ids, self.labels = self._load_ids_and_labels()
+        # check if the labels are empty
+        if len(self.labels) == 0:
+            raise ValueError("No labels found.")
+
@@ -478,30 +483,16 @@ def extract_name(test_file):
         all_feats = [all_feats[i] for i in indices]
         return all_feats

-    def get_columns_and_indices(self) -> tuple[list[str], list[int]]:
-        """Retrieves the names and indices of the columns in the data.
-
-        Returns:
-            A tuple containing the names of the columns and their indices.
-        """
-        raise NotImplementedError("This method is not implemented yet.")
-        files = get_model_files(self.cfg, self.split, self._data_shards[0])
-
-        def extract_name(test_file):
-            return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem))
-
-        agg_wind_combos = [extract_name(test_file) for test_file in files]
-
-        feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp)
-        all_feats = []
-        all_indices = []
-        for agg_wind in agg_wind_combos:
-            window, feat, agg = agg_wind.split("/")
-            feature_ids = get_feature_indices(feat + "/" + agg, feature_columns)
-            feature_names = [feature_columns[i] for i in feature_ids]
-            for feat_name in feature_names:
-                all_feats.append(f"{feat_name}/{agg}/{window}")
-            # use mask to append indices
-            all_indices.extend(feature_ids)
-
-        return all_feats, all_indices
+    def densify(self) -> np.ndarray:
+        """Builds the data as a dense matrix based on column subselection."""
+
+        # get the dense matrix by iterating through the data shards
+        data = []
+        labels = []
+        for shard_idx in range(len(self._data_shards)):
+            shard_data, shard_labels = self.get_data_shards(shard_idx)
+            data.append(shard_data)
+            labels.append(shard_labels)
+        data = sp.vstack(data)
+        labels = np.concatenate(labels, axis=0)
+        return data, labels
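
Mechanically, the new densify() is sparse vertical stacking over shards; note that sp.vstack keeps the features in a scipy sparse matrix until a caller materializes them. A self-contained sketch of the same stacking with toy shards:

import numpy as np
import scipy.sparse as sp

# Two toy (features, labels) shards standing in for get_data_shards() output:
shards = [
    (sp.random(4, 3, density=0.5, format="csr"), np.zeros(4)),
    (sp.random(2, 3, density=0.5, format="csr"), np.ones(2)),
]
X = sp.vstack([x for x, _ in shards])  # the same stacking densify() performs
y = np.concatenate([y for _, y in shards], axis=0)
assert X.shape == (6, 3) and y.shape == (6,)
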
5 changes: 1 addition & 4 deletions src/MEDS_tabular_automl/xgboost_model.py
@@ -45,10 +45,7 @@ def __init__(self, cfg: DictConfig, split: str):
         """
         xgb.DataIter.__init__(self, cache_prefix=Path(cfg.path.cache_dir))
         TabularDataset.__init__(self, cfg=cfg, split=split)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if self.labels is None:
-            raise ValueError("No labels found.")
+
         self._it = 0

     def next(self, input_data: Callable) -> int:
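
For context, the class trimmed here implements xgboost's DataIter protocol, and with label loading hoisted into TabularDataset.__init__ its own __init__ shrinks to resetting the shard cursor. A minimal, self-contained sketch of that protocol with toy in-memory shards (this is not the repo's class; the cache prefix and shard loading are simplified away):

import numpy as np
import xgboost as xgb


class ShardIter(xgb.DataIter):
    def __init__(self, shards):
        self._shards = shards  # list of (X, y) pairs, one per shard
        self._it = 0
        super().__init__()

    def next(self, input_data) -> int:
        if self._it == len(self._shards):
            return 0  # no shards left in this pass
        X, y = self._shards[self._it]
        input_data(data=X, label=y)  # hand the current shard to XGBoost
        self._it += 1
        return 1  # more data to come

    def reset(self) -> None:
        self._it = 0  # rewind for the next pass over the data


shards = [(np.random.rand(8, 3), np.random.randint(2, size=8)) for _ in range(2)]
dmatrix = xgb.QuantileDMatrix(ShardIter(shards))  # streams shards one at a time
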
2 changes: 1 addition & 1 deletion tests/test_tabularize.py
@@ -275,7 +275,7 @@ def test_tabularize(tmp_path):
         f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!"
     )
     output_files = list_subdir_files(str(Path(cfg.output_tabularized_dir).resolve()), "npz")
-    for split in split_json.keys():
+    for split in split_json:
         for window in cfg.tabularization.window_sizes:
             for agg in cfg.tabularization.aggs:
                 if agg.startswith("static"):
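
The test change is a pure idiom cleanup: iterating a dict already yields its keys, so the .keys() call is redundant. For example:

split_json = {"train": 0, "tuning": 1, "held_out": 2}
assert list(split_json) == list(split_json.keys())
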
