Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-worked the tabular dataset config a bit. #91

Merged
merged 8 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
default_language_version:
python: python3.12

exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
python: python3.11

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
Expand Down
37 changes: 0 additions & 37 deletions src/MEDS_tabular_automl/dense_iterator.py

This file was deleted.

15 changes: 1 addition & 14 deletions src/MEDS_tabular_automl/evaluation_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,10 @@
import polars as pl
from hydra.experimental.callback import Callback
from loguru import logger
from omegaconf import DictConfig, OmegaConf
from omegaconf import DictConfig


class EvaluationCallback(Callback):
def __init__(self, **kwargs):
self.kwargs = kwargs

def on_multirun_end(self, config: DictConfig, **kwargs):
"""Find best model based on log files and logger.info its performance and hyperparameters."""
log_fp = Path(config.model_logging.model_log_dir)
Expand All @@ -27,7 +24,6 @@ def on_multirun_end(self, config: DictConfig, **kwargs):

logger.info(f"The best model can be found at {best_model}")
self.log_performance(perf[0, :])
# self.log_hyperparams(log_fp / best_model / f"{config.model_logging.config_log_stem}.log")
if hasattr(config, "model_saving.delete_below_top_k") and config.delete_below_top_k >= 0:
self.delete_below_top_k_models(
perf, config.model_saving.delete_below_top_k, config.model_saving.model_dir
Expand All @@ -43,15 +39,6 @@ def log_performance(self, perf):
f"\nPerformance of best model:\nTuning AUC: {tuning_auc}\nTest AUC: {test_auc}",
)

def log_hyperparams(self, best_params_fp):
    """Log the hyperparameters of the best model as formatted YAML.

    Args:
        best_params_fp: Path to the file holding the best model's hyperparameters.

    Raises:
        FileNotFoundError: If ``best_params_fp`` is not an existing file.
    """
    if best_params_fp.is_file():
        # Load the stored config and render it as YAML for a readable log entry.
        hyperparams_yaml = OmegaConf.to_yaml(OmegaConf.load(best_params_fp))
        logger.info(f"\nHyperparameters of the best model:\n{hyperparams_yaml}")
        return
    raise FileNotFoundError(f"Best hyperparameters file not found at {best_params_fp}")

def delete_below_top_k_models(self, perf, k, model_dir):
"""Save only top k models from the model directory and delete all other files."""
top_k_models = perf.head(k)["model_fp"].values
Expand Down
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/scripts/launch_autogluon.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from loguru import logger
from omegaconf import DictConfig

from MEDS_tabular_automl.dense_iterator import DenseIterator
from MEDS_tabular_automl.tabular_dataset import TabularDataset as DenseIterator

from ..utils import hydra_loguru_init

Expand Down
41 changes: 1 addition & 40 deletions src/MEDS_tabular_automl/sklearn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,46 +7,7 @@
from sklearn.metrics import roc_auc_score

from .base_model import BaseModel
from .tabular_dataset import TabularDataset


class SklearnIterator(TabularDataset):
    """Tabular dataset over data shards, prepared for SciKit-Learn models.

    On top of what ``TabularDataset`` sets up, construction loads the valid event IDs
    and the labels, and refuses to continue when no labels are available.

    Args:
        cfg: A configuration dictionary containing parameters for data processing,
            feature selection, and other settings.
        split: The data split to load and process; one of "train", "tuning", or
            "held_out".

    Attributes:
        cfg: The configuration dictionary passed at construction.
        file_name_resolver: Resolves file names and paths based on the configuration.
        split: The data split being loaded and processed.
        _data_shards: List of data shard names.
        valid_event_ids: Maps shard number to the list of valid event IDs.
        labels: Maps shard number to the labels for the corresponding event IDs.
        codes_set: Set of codes to include in the data.
        code_masks: Code masks used to filter features based on aggregation.
        num_features: Total number of features in the data.
    """

    def __init__(self, cfg: DictConfig, split: str):
        """Set up the dataset for ``split`` and load its event IDs and labels.

        Args:
            cfg: The configuration dictionary.
            split: The data split to use.

        Raises:
            ValueError: If the loaded labels are empty.
        """
        super().__init__(cfg=cfg, split=split)
        ids_and_labels = self._load_ids_and_labels()
        self.valid_event_ids, self.labels = ids_and_labels
        # An empty label collection means there is nothing to train or evaluate on.
        if not len(self.labels):
            raise ValueError("No labels found.")
raise ValueError("No labels found.")
from .tabular_dataset import TabularDataset as SklearnIterator


class SklearnMatrix:
Expand Down
45 changes: 18 additions & 27 deletions src/MEDS_tabular_automl/tabular_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ def __init__(self, cfg: DictConfig, split: str = "train"):
self._set_scaler()
self._set_imputer()

self.valid_event_ids, self.labels = self._load_ids_and_labels()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't load labels here since I was trying to leave it relatively flexible in case someone wanted to use this to explore the data before they had labels (originally this came up when we discussed wanting to look at data in a notebook for the clustering work). I am not necessarily extremely attached to this, but that was the thought behind it.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think given that right now all uses of this rely on labels, unless I'm misreading the code, it seems worthwhile to just do it all here, right?

# check if the labels are empty
if len(self.labels) == 0:
raise ValueError("No labels found.")

@TimeableMixin.TimeAs
def _get_code_masks(self, feature_columns: list, codes_set: set) -> Mapping[str, list[bool]]:
"""Creates boolean masks for filtering features.
Expand Down Expand Up @@ -470,30 +475,16 @@ def extract_name(test_file):
all_feats = [all_feats[i] for i in indices]
return all_feats

def get_columns_and_indices(self) -> tuple[list[str], list[int]]:
    """Retrieves the names and indices of the columns in the data.

    This is a placeholder: the method is not implemented and always raises. No
    unreachable draft code is kept after the ``raise`` so the body cannot be
    mistaken for a working implementation.

    Returns:
        Intended to be a tuple containing the names of the columns and their
        indices; nothing is returned at present.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("This method is not implemented yet.")
def densify(self) -> tuple[sp.spmatrix, np.ndarray]:
    """Stacks the features and labels of all data shards into single arrays.

    Every shard listed in ``self._data_shards`` is loaded through
    ``self.get_data_shards`` in order. The per-shard feature matrices are combined
    with ``sp.vstack`` and the per-shard labels with ``np.concatenate``.

    NOTE(review): despite the method name, nothing here converts the stacked
    matrix to a dense array; callers that need one must convert the result
    themselves -- confirm this is intended.

    Returns:
        A tuple ``(data, labels)``: the vertically stacked shard feature matrices
        and the shard labels concatenated along axis 0.
    """
    # Load each shard once, keeping (features, labels) pairs in shard order.
    shards = [self.get_data_shards(shard_idx) for shard_idx in range(len(self._data_shards))]
    data = sp.vstack([shard_data for shard_data, _ in shards])
    labels = np.concatenate([shard_labels for _, shard_labels in shards], axis=0)
    return data, labels
5 changes: 1 addition & 4 deletions src/MEDS_tabular_automl/xgboost_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,7 @@ def __init__(self, cfg: DictConfig, split: str):
"""
xgb.DataIter.__init__(self, cache_prefix=Path(cfg.cache_dir))
TabularDataset.__init__(self, cfg=cfg, split=split)
self.valid_event_ids, self.labels = self._load_ids_and_labels()
# check if the labels are empty
if self.labels is None:
raise ValueError("No labels found.")

self._it = 0

def next(self, input_data: Callable) -> int:
Expand Down
Loading