From 2feee790a3b1ad0123cde536dd2d948c2789cd4e Mon Sep 17 00:00:00 2001
From: Teya Bergamaschi
Date: Tue, 20 Aug 2024 15:07:03 +0000
Subject: [PATCH] [wip] filtering features

---
 src/MEDS_tabular_automl/sklearn_model.py   |  2 +-
 src/MEDS_tabular_automl/tabular_dataset.py | 14 +++++++++-----
 src/MEDS_tabular_automl/utils.py           |  7 +++----
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py
index cbaa639..b660123 100644
--- a/src/MEDS_tabular_automl/sklearn_model.py
+++ b/src/MEDS_tabular_automl/sklearn_model.py
@@ -225,7 +225,7 @@ def evaluate(self, split: str = "tuning") -> float:
             y_true.extend(labels)
         y_pred = np.array(y_pred)
         y_true = np.array(y_true)
-
+
         # check if y_pred and y_true are not empty
         if len(y_pred) == 0 or len(y_true) == 0:
             raise ValueError("Predictions or true labels are empty.")
diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py
index 594b82c..e484598 100644
--- a/src/MEDS_tabular_automl/tabular_dataset.py
+++ b/src/MEDS_tabular_automl/tabular_dataset.py
@@ -170,12 +170,16 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]:
         codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes}
 
         if hasattr(self.cfg.tabularization, "max_by_correlation"):
-            corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1])
+            corrs = self._get_approximate_correlation_per_feature(
+                self.get_data_shards(0)[0], self.get_data_shards(0)[1]
+            )
             corrs = np.abs(corrs)
             sorted_corrs = np.argsort(corrs)[::-1]
             codes_set = set(sorted_corrs[: self.cfg.tabularization.max_by_correlation])
         if hasattr(self.cfg.tabularization, "min_correlation"):
-            corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1])
+            corrs = self._get_approximate_correlation_per_feature(
+                self.get_data_shards(0)[0], self.get_data_shards(0)[1]
+            )
             corrs = np.abs(corrs)
             codes_set = set(np.where(corrs > self.cfg.tabularization.min_correlation)[0])
 
@@ -184,7 +188,7 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]:
             self._get_code_masks(feature_columns, codes_set),
             len(feature_columns),
         )
-
+
     def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarray) -> np.ndarray:
         """Calculates the approximate correlation of each feature with the target.
 
@@ -202,14 +206,14 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr
        """
        # check that y has information
        if len(np.unique(y)) == 1:
            raise ValueError("Labels have no information. Cannot calculate correlation.")
-
+        from scipy.stats import pearsonr
+
        corrs = np.zeros(X.shape[1])
        for i in range(X.shape[1]):
            corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0]
        return corrs
-
 
    @TimeableMixin.TimeAs
    def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix:
        """Loads a specific data shard into memory as a sparse matrix.
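Note on the correlation filter above: _get_approximate_correlation_per_feature densifies one column per pearsonr call, which gets slow when a shard carries many thousands of features. The sketch below computes the same per-column Pearson correlations in a single vectorized pass over sparse column statistics; the helper name sparse_pearson_correlations is hypothetical and not part of this patch, and it assumes y is a dense 1-D label array, as in the loop above.

import numpy as np
import scipy.sparse as sp


def sparse_pearson_correlations(X: sp.csc_matrix, y: np.ndarray) -> np.ndarray:
    """Pearson correlation of every column of X with y, without densifying columns.

    Sketch only: intended to match looping scipy.stats.pearsonr over the columns.
    Constant columns yield NaN, mirroring pearsonr's undefined-correlation case.
    """
    n = X.shape[0]
    y = np.asarray(y, dtype=np.float64)

    mean_x = np.asarray(X.mean(axis=0)).ravel()               # E[x_j]
    mean_x2 = np.asarray(X.multiply(X).mean(axis=0)).ravel()  # E[x_j ** 2]
    mean_xy = np.asarray(X.T @ y).ravel() / n                 # E[x_j * y]

    std_x = np.sqrt(mean_x2 - mean_x**2)  # population std of each column
    std_y = y.std()                       # population std of the target

    with np.errstate(divide="ignore", invalid="ignore"):
        return (mean_xy - mean_x * y.mean()) / (std_x * std_y)

np.abs of this result would drop into the max_by_correlation and min_correlation branches unchanged.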
diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py
index f5e6251..badb246 100644
--- a/src/MEDS_tabular_automl/utils.py
+++ b/src/MEDS_tabular_automl/utils.py
@@ -78,17 +78,16 @@ def filter_to_codes(
     if min_code_inclusion_frequency is not None:
         pass
-        # need to consider size of the dataset vs count
-
+        # need to consider size of the dataset vs count
+        # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency)
-
+
     if min_code_inclusion_count is not None:
         feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)
 
     if max_include_codes is not None:
         feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes)
-
     return sorted(feature_freqs["code"].to_list())
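The count and top-N filters that this hunk touches in filter_to_codes compose as ordinary polars expressions. A minimal self-contained sketch follows; the frequency table, codes, and thresholds are invented for illustration, and recent polars spells the sort keyword descending, whereas the reverse keyword visible in the context line above comes from an older polars API.

import polars as pl

# Hypothetical per-code frequency table with the columns the function filters on.
feature_freqs = pl.DataFrame(
    {
        "code": ["lab//A", "lab//B", "dx//C", "dx//D"],
        "count": [500, 40, 7, 3],
    }
)

min_code_inclusion_count = 10
max_include_codes = 2

# Keep codes observed at least min_code_inclusion_count times.
feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)

# Of the remaining codes, keep only the max_include_codes most frequent.
feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes)

print(sorted(feature_freqs["code"].to_list()))  # ['lab//A', 'lab//B']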