Skip to content

Commit

Permalink
Added support via hydra for selecting among four imputation methods (…
Browse files Browse the repository at this point in the history
…none, mean, median, mode), and three normalization methods (none, standard_scaler, and min_max_scaler)
  • Loading branch information
Oufattole committed Aug 21, 2024
1 parent 8c54317 commit ecf9292
Show file tree
Hide file tree
Showing 12 changed files with 51 additions and 17 deletions.
1 change: 1 addition & 0 deletions src/MEDS_tabular_automl/configs/imputer/default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
imputer_target: null
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
imputer_target:
_target_: sklearn.impute.SimpleImputer
strategy: "mean"
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
imputer_target:
_target_: sklearn.impute.SimpleImputer
strategy: "median"
3 changes: 3 additions & 0 deletions src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
imputer_target:
_target_: sklearn.impute.SimpleImputer
strategy: "most_frequent"
2 changes: 2 additions & 0 deletions src/MEDS_tabular_automl/configs/launch_model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ defaults:
- default
- tabularization: default
- model: xgboost # This can be changed to sgd_classifier or any other model
- imputer: default
- normalization: default
- override hydra/sweeper: optuna
- override hydra/sweeper/sampler: tpe
- override hydra/launcher: joblib
Expand Down
4 changes: 4 additions & 0 deletions src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ model_target:
output_filepath: ${output_filepath}
log_dir: ${log_dir}
cache_dir: ${cache_dir}
imputer: ${model_params.iterator.imputer}
normalization: ${model_params.iterator.normalization}

model_params:
epochs: 20
Expand All @@ -19,6 +21,8 @@ model_params:
iterator:
keep_data_in_memory: True
binarize_task: True
normalization: ${normalization}
imputer: ${imputer}

hydra:
sweeper:
Expand Down
4 changes: 4 additions & 0 deletions src/MEDS_tabular_automl/configs/model/xgboost.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ model_target:
output_filepath: ${output_filepath}
log_dir: ${log_dir}
cache_dir: ${cache_dir}
imputer: ${imputer}
normalization: ${normalization}
# tabularization: ${tabularization} # Ideally we should define tabularization here, but there is an issue initializing with its resolvers.

model_params:
Expand All @@ -23,6 +25,8 @@ model_params:
iterator:
keep_data_in_memory: True
binarize_task: True
normalization: ${normalization}
imputer: ${imputer}

hydra:
sweeper:
Expand Down
1 change: 1 addition & 0 deletions src/MEDS_tabular_automl/configs/normalization/default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
normalizer: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
normalizer:
_target_: sklearn.preprocessing.MinMaxScaler
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
normalizer:
_target_: sklearn.preprocessing.StandardScaler
with_mean: False # This preserves the sparsity of the input data.
34 changes: 19 additions & 15 deletions src/MEDS_tabular_automl/tabular_dataset.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from collections.abc import Mapping
from pathlib import Path

import hydra
import numpy as np
import polars as pl
import scipy.sparse as sp
from mixins import TimeableMixin
from omegaconf import DictConfig
from scipy.stats import pearsonr

from .describe_codes import get_feature_columns
from .file_name import get_model_files, list_subdir_files
Expand Down Expand Up @@ -173,19 +173,27 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]:
allowed_codes = set(self.cfg.tabularization._resolved_codes)
codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes}

if hasattr(self.cfg.tabularization, "max_by_correlation"):
if (
hasattr(self.cfg.tabularization, "max_by_correlation")
and self.cfg.tabularization.max_by_correlation
):
corrs = self._get_approximate_correlation_per_feature(
self.get_data_shards(0)[0], self.get_data_shards(0)[1]
)
corrs = np.abs(corrs)
sorted_corrs = np.argsort(corrs)[::-1]
codes_set = set(sorted_corrs[: self.cfg.tabularization.max_by_correlation])
if hasattr(self.cfg.tabularization, "min_correlation"):

codes_set = codes_set.intersection(
set(sorted_corrs[: self.cfg.tabularization.max_by_correlation])
)
if hasattr(self.cfg.tabularization, "min_correlation") and self.cfg.tabularization.min_correlation:
corrs = self._get_approximate_correlation_per_feature(
self.get_data_shards(0)[0], self.get_data_shards(0)[1]
)
corrs = np.abs(corrs)
codes_set = set(np.where(corrs > self.cfg.tabularization.min_correlation)[0])
codes_set = codes_set.intersection(
set(np.where(corrs > self.cfg.tabularization.min_correlation)[0])
)

return (
codes_set,
Expand All @@ -209,19 +217,15 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr

# check that y has information
if len(np.unique(y)) == 1:
raise ValueError("Labels have no information. Cannot calculate correlation.")

from scipy.stats import pearsonr
raise ValueError("Labels have only one unique value. Cannot calculate correlation.")

corrs = np.zeros(X.shape[1])
for i in range(X.shape[1]):
corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0]
corrs = np.apply_along_axis(lambda col: pearsonr(col.flatten(), y)[0], 0, X.toarray())
return corrs

def _set_imputer(self):
"""Sets the imputer for the data."""
if hasattr(self.cfg.model_params.iterator, "impute"):
imputer = hydra.utils.instantiate(self.cfg.model_params.iterator.imputer)
if self.cfg.model_params.iterator.imputer.imputer_target:
imputer = self.cfg.model_params.iterator.imputer.imputer_target
if hasattr(imputer, "partial_fit"):
for i in range(len(self._data_shards)):
X, _ = self.get_data_shards(i)
Expand All @@ -236,8 +240,8 @@ def _set_imputer(self):

def _set_scaler(self):
"""Sets the scaler for the data."""
if hasattr(self.cfg.model_params.iterator, "scaler"):
scaler = hydra.utils.instantiate(self.cfg.model_params.iterator.scaler)
if self.cfg.model_params.iterator.normalization.normalizer:
scaler = self.cfg.model_params.iterator.normalization.normalizer
if hasattr(scaler, "partial_fit"):
for i in range(len(self._data_shards)):
X, _ = self.get_data_shards(i)
Expand Down
8 changes: 6 additions & 2 deletions tests/test_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def make_config_mutable(cfg):


@pytest.mark.parametrize("model", ["xgboost", "sgd_classifier"])
def test_model_config(model):
@pytest.mark.parametrize("imputer", ["default", "mean_imputer", "mode_imputer", "median_imputer"])
@pytest.mark.parametrize("normalization", ["min_max_scaler", "standard_scaler"])
def test_model_config(model, imputer, normalization):
MEDS_cohort_dir = "blah"
xgboost_config_kwargs = {
"MEDS_cohort_dir": MEDS_cohort_dir,
Expand All @@ -53,7 +55,9 @@ def test_model_config(model):
with initialize(
version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
): # path to config.yaml
overrides = [f"model={model}"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()]
overrides = [f"model={model}", f"imputer={imputer}", f"normalization={normalization}"] + [
f"{k}={v}" for k, v in xgboost_config_kwargs.items()
]
cfg = compose(
config_name="launch_model", overrides=overrides, return_hydra_config=True
) # config.yaml
Expand Down

0 comments on commit ecf9292

Please sign in to comment.