Commit

Merge pull request #91 from mmcdermott/improve_test_coverage
Re-worked the tabular dataset config a bit.
mmcdermott authored Sep 9, 2024
2 parents 0db7bd6 + aed27f1 commit c981534
Showing 12 changed files with 35 additions and 134 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/code-quality-main.yaml
@@ -13,10 +13,12 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"

       - name: Run pre-commits
         uses: pre-commit/[email protected]
6 changes: 4 additions & 2 deletions .github/workflows/code-quality-pr.yaml
@@ -16,10 +16,12 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"

       - name: Find modified files
         id: file_changes
2 changes: 1 addition & 1 deletion .github/workflows/publish-to-pypi.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.x"
+          python-version: "3.11"
       - name: Install pypa/build
         run: >-
           python3 -m
4 changes: 2 additions & 2 deletions .github/workflows/tests.yaml
@@ -19,10 +19,10 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
4 changes: 1 addition & 3 deletions .pre-commit-config.yaml
@@ -1,7 +1,5 @@
 default_language_version:
-  python: python3.12
-
-exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
+  python: python3.11

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
37 changes: 0 additions & 37 deletions src/MEDS_tabular_automl/dense_iterator.py

This file was deleted.

15 changes: 1 addition & 14 deletions src/MEDS_tabular_automl/evaluation_callback.py
@@ -3,13 +3,10 @@
 import polars as pl
 from hydra.experimental.callback import Callback
 from loguru import logger
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import DictConfig


 class EvaluationCallback(Callback):
-    def __init__(self, **kwargs):
-        self.kwargs = kwargs
-
     def on_multirun_end(self, config: DictConfig, **kwargs):
         """Find best model based on log files and logger.info its performance and hyperparameters."""
         log_fp = Path(config.model_logging.model_log_dir)
@@ -27,7 +24,6 @@ def on_multirun_end(self, config: DictConfig, **kwargs):

         logger.info(f"The best model can be found at {best_model}")
         self.log_performance(performance[0, :])
-        # self.log_hyperparams(log_fp / best_model / f"{config.model_logging.config_log_stem}.log")
         if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0:
             self.delete_below_top_k_models(
                 performance, config.model_saving.delete_below_top_k, config.model_saving.model_dir
@@ -43,15 +39,6 @@ def log_performance(self, best_model_performance):
             f"\nPerformance of best model:\nTuning AUC: {tuning_auc}\nTest AUC: {test_auc}",
         )

-    def log_hyperparams(self, best_params_fp):
-        """logger.info hyperparameters of the best model with nice formatting."""
-        # check if this file exists
-        if not best_params_fp.is_file():
-            raise FileNotFoundError(f"Best hyperparameters file not found at {best_params_fp}")
-        best_params = OmegaConf.load(best_params_fp)
-        # print using OmegaConf.to_yaml
-        logger.info(f"\nHyperparameters of the best model:\n{OmegaConf.to_yaml(best_params)}")
-
     def delete_below_top_k_models(self, performance, k, model_dir):
         """Save only top k models from the model directory and delete all other files."""
         top_k_models = performance.head(k)["model_fp"].values
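
For context, the surviving delete_below_top_k_models hook is what prunes everything below the k best models. A minimal sketch of that kind of pruning logic, assuming a polars DataFrame with "model_fp" and "tuning_auc" columns (the column names and helper name are illustrative, not taken verbatim from the repo):

import shutil
from pathlib import Path

import polars as pl


def prune_to_top_k(performance: pl.DataFrame, k: int, model_dir: Path) -> None:
    # Keep the k models with the best tuning AUC; delete every other artifact.
    keep = set(performance.sort("tuning_auc", descending=True).head(k)["model_fp"].to_list())
    for fp in Path(model_dir).iterdir():
        if str(fp) not in keep:
            fp.unlink() if fp.is_file() else shutil.rmtree(fp)
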
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -12,7 +12,7 @@
 except ImportError:
     ag = None

-from MEDS_tabular_automl.dense_iterator import DenseIterator
+from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator

 from ..utils import hydra_loguru_init, stage_init
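
The deleted dense_iterator.py is replaced by the import alias above, so existing call sites keep working unchanged. A minimal sketch of the pattern (the constructor call is hypothetical usage, assuming a Hydra DictConfig that matches the repo's tabularization schema):

from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator

# Callers that instantiated the old dedicated class are untouched:
# iterator = DenseIterator(cfg, split="held_out")
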
41 changes: 1 addition & 40 deletions src/MEDS_tabular_automl/sklearn_model.py
@@ -8,46 +8,7 @@
 from sklearn.metrics import roc_auc_score

 from .base_model import BaseModel
-from .tabular_dataset import TabularDataset
-
-
-class SklearnIterator(TabularDataset):
-    """SklearnIterator class for loading and processing data shards for use in SciKit-Learn models.
-
-    This class provides functionality for iterating through data shards, loading
-    feature data and labels, and processing them based on the provided configuration.
-
-    Args:
-        cfg: A configuration dictionary containing parameters for
-            data processing, feature selection, and other settings.
-        split: The data split to use, which can be one of "train", "tuning",
-            or "held_out". This determines which subset of the data is loaded and processed.
-
-    Attributes:
-        cfg: Configuration dictionary containing parameters for
-            data processing, feature selection, and other settings.
-        file_name_resolver: Object for resolving file names and paths based on the configuration.
-        split: The data split being used for loading and processing data shards.
-        _data_shards: List of data shard names.
-        valid_event_ids: Dictionary mapping shard number to a list of valid event IDs.
-        labels: Dictionary mapping shard number to a list of labels for the corresponding event IDs.
-        codes_set: Set of codes to include in the data.
-        code_masks: Dictionary of code masks for filtering features based on aggregation.
-        num_features: Total number of features in the data.
-    """
-
-    def __init__(self, cfg: DictConfig, split: str):
-        """Initializes the SklearnIterator with the provided configuration and data split.
-
-        Args:
-            cfg: The configuration dictionary.
-            split: The data split to use.
-        """
-        super().__init__(cfg=cfg, split=split)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if len(self.labels) == 0:
-            raise ValueError("No labels found.")
+from .tabular_dataset import TabularDataset as SklearnIterator


 class SklearnMatrix:
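
The subclass deleted above added nothing beyond loading and validating labels, and that logic moves into TabularDataset.__init__ in this same commit (see tabular_dataset.py below), so a bare alias suffices. A self-contained sketch of the equivalence, with illustrative names (Base stands in for TabularDataset):

class Base:
    def __init__(self, labels: list[int]):
        # Validation the subclass previously repeated in its own __init__:
        if len(labels) == 0:
            raise ValueError("No labels found.")
        self.labels = labels


# Before: class SklearnIterator(Base) existed solely to run that check.
# After: the check lives in Base, so the subclass collapses to an alias.
SklearnIterator = Base
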
45 changes: 18 additions & 27 deletions src/MEDS_tabular_automl/tabular_dataset.py
@@ -69,6 +69,11 @@ def __init__(self, cfg: DictConfig, split: str = "train"):
         self._set_scaler()
         self._set_imputer()

+        self.valid_event_ids, self.labels = self._load_ids_and_labels()
+        # check if the labels are empty
+        if len(self.labels) == 0:
+            raise ValueError("No labels found.")
+
@@ -478,30 +483,16 @@ def extract_name(test_file):
         all_feats = [all_feats[i] for i in indices]
         return all_feats

-    def get_columns_and_indices(self) -> tuple[list[str], list[int]]:
-        """Retrieves the names and indices of the columns in the data.
-
-        Returns:
-            A tuple containing the names of the columns and their indices.
-        """
-        raise NotImplementedError("This method is not implemented yet.")
-        files = get_model_files(self.cfg, self.split, self._data_shards[0])
-
-        def extract_name(test_file):
-            return str(Path(test_file.parent.parent.stem, test_file.parent.stem, test_file.stem))
-
-        agg_wind_combos = [extract_name(test_file) for test_file in files]
-
-        feature_columns = get_feature_columns(self.cfg.tabularization.filtered_code_metadata_fp)
-        all_feats = []
-        all_indices = []
-        for agg_wind in agg_wind_combos:
-            window, feat, agg = agg_wind.split("/")
-            feature_ids = get_feature_indices(feat + "/" + agg, feature_columns)
-            feature_names = [feature_columns[i] for i in feature_ids]
-            for feat_name in feature_names:
-                all_feats.append(f"{feat_name}/{agg}/{window}")
-            # use mask to append indices
-            all_indices.extend(feature_ids)
-
-        return all_feats, all_indices
+    def densify(self) -> np.ndarray:
+        """Builds the data as a dense matrix based on column subselection."""
+
+        # get the dense matrix by iterating through the data shards
+        data = []
+        labels = []
+        for shard_idx in range(len(self._data_shards)):
+            shard_data, shard_labels = self.get_data_shards(shard_idx)
+            data.append(shard_data)
+            labels.append(shard_labels)
+        data = sp.vstack(data)
+        labels = np.concatenate(labels, axis=0)
+        return data, labels
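
Mechanically, the new densify() is sparse vertical stacking over shards; note that sp.vstack keeps the features in a scipy sparse matrix until a caller materializes them. A self-contained sketch of the same stacking with toy shards:

import numpy as np
import scipy.sparse as sp

# Two toy (features, labels) shards standing in for get_data_shards() output:
shards = [
    (sp.random(4, 3, density=0.5, format="csr"), np.zeros(4)),
    (sp.random(2, 3, density=0.5, format="csr"), np.ones(2)),
]
X = sp.vstack([x for x, _ in shards])  # the same stacking densify() performs
y = np.concatenate([y for _, y in shards], axis=0)
assert X.shape == (6, 3) and y.shape == (6,)
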
5 changes: 1 addition & 4 deletions src/MEDS_tabular_automl/xgboost_model.py
@@ -45,10 +45,7 @@ def __init__(self, cfg: DictConfig, split: str):
         """
         xgb.DataIter.__init__(self, cache_prefix=Path(cfg.path.cache_dir))
         TabularDataset.__init__(self, cfg=cfg, split=split)
-        self.valid_event_ids, self.labels = self._load_ids_and_labels()
-        # check if the labels are empty
-        if self.labels is None:
-            raise ValueError("No labels found.")
+
         self._it = 0

     def next(self, input_data: Callable) -> int:
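
For context, the class trimmed here implements xgboost's DataIter protocol, and with label loading hoisted into TabularDataset.__init__ its own __init__ shrinks to resetting the shard cursor. A minimal, self-contained sketch of that protocol with toy in-memory shards (this is not the repo's class; the cache prefix and shard loading are simplified away):

import numpy as np
import xgboost as xgb


class ShardIter(xgb.DataIter):
    def __init__(self, shards):
        self._shards = shards  # list of (X, y) pairs, one per shard
        self._it = 0
        super().__init__()

    def next(self, input_data) -> int:
        if self._it == len(self._shards):
            return 0  # no shards left in this pass
        X, y = self._shards[self._it]
        input_data(data=X, label=y)  # hand the current shard to XGBoost
        self._it += 1
        return 1  # more data to come

    def reset(self) -> None:
        self._it = 0  # rewind for the next pass over the data


shards = [(np.random.rand(8, 3), np.random.randint(2, size=8)) for _ in range(2)]
dmatrix = xgb.QuantileDMatrix(ShardIter(shards))  # streams shards one at a time
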
2 changes: 1 addition & 1 deletion tests/test_tabularize.py
@@ -275,7 +275,7 @@ def test_tabularize(tmp_path):
         f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!"
     )
     output_files = list_subdir_files(str(Path(cfg.output_tabularized_dir).resolve()), "npz")
-    for split in split_json.keys():
+    for split in split_json:
         for window in cfg.tabularization.window_sizes:
             for agg in cfg.tabularization.aggs:
                 if agg.startswith("static"):
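
The test change is a pure idiom cleanup: iterating a dict already yields its keys, so the .keys() call is redundant. For example:

split_json = {"train": 0, "tuning": 1, "held_out": 2}
assert list(split_json) == list(split_json.keys())
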
