Added support via hydra for selecting among four imputation methods (none, mean, median, mode) and three normalization methods (none, standard_scaler, and min_max_scaler)
mmcdermott · Aug 21, 2024 · ecf9292 · ecf9292
1 parent 8c54317
commit ecf9292
Show file tree

Hide file tree

Showing 12 changed files with 51 additions and 17 deletions.
diff --git a/src/MEDS_tabular_automl/configs/imputer/default.yaml b/src/MEDS_tabular_automl/configs/imputer/default.yaml
@@ -0,0 +1 @@
+imputer_target: null
diff --git a/src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml b/src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml
@@ -0,0 +1,3 @@
+imputer_target:
+  _target_: sklearn.impute.SimpleImputer
+  strategy: "mean"
diff --git a/src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml b/src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml
@@ -0,0 +1,3 @@
+imputer_target:
+  _target_: sklearn.impute.SimpleImputer
+  strategy: "median"
diff --git a/src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml b/src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml
@@ -0,0 +1,3 @@
+imputer_target:
+  _target_: sklearn.impute.SimpleImputer
+  strategy: "most_frequent"
diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml
@@ -3,6 +3,8 @@ defaults:
   - default
   - tabularization: default
   - model: xgboost # This can be changed to sgd_classifier or any other model
+  - imputer: default
+  - normalization: default
   - override hydra/sweeper: optuna
   - override hydra/sweeper/sampler: tpe
   - override hydra/launcher: joblib

diff --git a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml
@@ -9,6 +9,8 @@ model_target:
   output_filepath: ${output_filepath}
   log_dir: ${log_dir}
   cache_dir: ${cache_dir}
+  imputer: ${model_params.iterator.imputer}
+  normalization: ${model_params.iterator.normalization}
 
 model_params:
   epochs: 20
@@ -19,6 +21,8 @@ model_params:
   iterator:
     keep_data_in_memory: True
     binarize_task: True
+    normalization: ${normalization}
+    imputer: ${imputer}
 
 hydra:
   sweeper:

diff --git a/src/MEDS_tabular_automl/configs/model/xgboost.yaml b/src/MEDS_tabular_automl/configs/model/xgboost.yaml
@@ -9,6 +9,8 @@ model_target:
   output_filepath: ${output_filepath}
   log_dir: ${log_dir}
   cache_dir: ${cache_dir}
+  imputer: ${imputer}
+  normalization: ${normalization}
   # tabularization: ${tabularization} # Ideally we should define tabularization here, but there is an issue initializing with its resolvers.
 
 model_params:
@@ -23,6 +25,8 @@ model_params:
   iterator:
     keep_data_in_memory: True
     binarize_task: True
+    normalization: ${normalization}
+    imputer: ${imputer}
 
 hydra:
   sweeper:

diff --git a/src/MEDS_tabular_automl/configs/normalization/default.yaml b/src/MEDS_tabular_automl/configs/normalization/default.yaml
@@ -0,0 +1 @@
+normalizer: null
diff --git a/src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml b/src/MEDS_tabular_automl/configs/normalization/min_max_scaler.yaml
@@ -0,0 +1,2 @@
+normalizer:
+  _target_: sklearn.preprocessing.MinMaxScaler
diff --git a/src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml b/src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml
@@ -0,0 +1,3 @@
+normalizer:
+  _target_: sklearn.preprocessing.StandardScaler
+  with_mean: False # This preserves the sparsity of the input data.
diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py
@@ -1,12 +1,12 @@
 from collections.abc import Mapping
 from pathlib import Path
 
-import hydra
 import numpy as np
 import polars as pl
 import scipy.sparse as sp
 from mixins import TimeableMixin
 from omegaconf import DictConfig
+from scipy.stats import pearsonr
 
 from .describe_codes import get_feature_columns
 from .file_name import get_model_files, list_subdir_files
@@ -173,19 +173,27 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]:
         allowed_codes = set(self.cfg.tabularization._resolved_codes)
         codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes}
 
-        if hasattr(self.cfg.tabularization, "max_by_correlation"):
+        if (
+            hasattr(self.cfg.tabularization, "max_by_correlation")
+            and self.cfg.tabularization.max_by_correlation
+        ):
             corrs = self._get_approximate_correlation_per_feature(
                 self.get_data_shards(0)[0], self.get_data_shards(0)[1]
             )
             corrs = np.abs(corrs)
             sorted_corrs = np.argsort(corrs)[::-1]
-            codes_set = set(sorted_corrs[: self.cfg.tabularization.max_by_correlation])
-        if hasattr(self.cfg.tabularization, "min_correlation"):
+
+            codes_set = codes_set.intersection(
+                set(sorted_corrs[: self.cfg.tabularization.max_by_correlation])
+            )
+        if hasattr(self.cfg.tabularization, "min_correlation") and self.cfg.tabularization.min_correlation:
             corrs = self._get_approximate_correlation_per_feature(
                 self.get_data_shards(0)[0], self.get_data_shards(0)[1]
             )
             corrs = np.abs(corrs)
-            codes_set = set(np.where(corrs > self.cfg.tabularization.min_correlation)[0])
+            codes_set = codes_set.intersection(
+                set(np.where(corrs > self.cfg.tabularization.min_correlation)[0])
+            )
 
         return (
             codes_set,
@@ -209,19 +217,15 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr
 
         # check that y has information
         if len(np.unique(y)) == 1:
-            raise ValueError("Labels have no information. Cannot calculate correlation.")
-
-        from scipy.stats import pearsonr
+            raise ValueError("Labels have only one unique value. Cannot calculate correlation.")
 
-        corrs = np.zeros(X.shape[1])
-        for i in range(X.shape[1]):
-            corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0]
+        corrs = np.apply_along_axis(lambda col: pearsonr(col.flatten(), y)[0], 0, X.toarray())
         return corrs
 
     def _set_imputer(self):
         """Sets the imputer for the data."""
-        if hasattr(self.cfg.model_params.iterator, "impute"):
-            imputer = hydra.utils.instantiate(self.cfg.model_params.iterator.imputer)
+        if self.cfg.model_params.iterator.imputer.imputer_target:
+            imputer = self.cfg.model_params.iterator.imputer.imputer_target
             if hasattr(imputer, "partial_fit"):
                 for i in range(len(self._data_shards)):
                     X, _ = self.get_data_shards(i)
@@ -236,8 +240,8 @@ def _set_imputer(self):
 
     def _set_scaler(self):
         """Sets the scaler for the data."""
-        if hasattr(self.cfg.model_params.iterator, "scaler"):
-            scaler = hydra.utils.instantiate(self.cfg.model_params.iterator.scaler)
+        if self.cfg.model_params.iterator.normalization.normalizer:
+            scaler = self.cfg.model_params.iterator.normalization.normalizer
             if hasattr(scaler, "partial_fit"):
                 for i in range(len(self._data_shards)):
                     X, _ = self.get_data_shards(i)

diff --git a/tests/test_configs.py b/tests/test_configs.py
@@ -35,7 +35,9 @@ def make_config_mutable(cfg):
 
 
 @pytest.mark.parametrize("model", ["xgboost", "sgd_classifier"])
-def test_model_config(model):
+@pytest.mark.parametrize("imputer", ["default", "mean_imputer", "mode_imputer", "median_imputer"])
+@pytest.mark.parametrize("normalization", ["min_max_scaler", "standard_scaler"])
+def test_model_config(model, imputer, normalization):
     MEDS_cohort_dir = "blah"
     xgboost_config_kwargs = {
         "MEDS_cohort_dir": MEDS_cohort_dir,
@@ -53,7 +55,9 @@ def test_model_config(model):
     with initialize(
         version_base=None, config_path="../src/MEDS_tabular_automl/configs/"
     ):  # path to config.yaml
-        overrides = [f"model={model}"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()]
+        overrides = [f"model={model}", f"imputer={imputer}", f"normalization={normalization}"] + [
+            f"{k}={v}" for k, v in xgboost_config_kwargs.items()
+        ]
         cfg = compose(
             config_name="launch_model", overrides=overrides, return_hydra_config=True
         )  # config.yaml
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		normalizer:
		_target_: sklearn.preprocessing.MinMaxScaler