From 2feee790a3b1ad0123cde536dd2d948c2789cd4e Mon Sep 17 00:00:00 2001
From: Teya Bergamaschi
Date: Tue, 20 Aug 2024 15:07:03 +0000
Subject: [PATCH] [wip] filtering features

---
 src/MEDS_tabular_automl/sklearn_model.py   |  2 +-
 src/MEDS_tabular_automl/tabular_dataset.py | 14 +++++++++-----
 src/MEDS_tabular_automl/utils.py           |  7 +++----
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py
index cbaa639..b660123 100644
--- a/src/MEDS_tabular_automl/sklearn_model.py
+++ b/src/MEDS_tabular_automl/sklearn_model.py
@@ -225,7 +225,7 @@ def evaluate(self, split: str = "tuning") -> float:
             y_true.extend(labels)
         y_pred = np.array(y_pred)
         y_true = np.array(y_true)
-
+
         # check if y_pred and y_true are not empty
         if len(y_pred) == 0 or len(y_true) == 0:
             raise ValueError("Predictions or true labels are empty.")
diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py
index 594b82c..e484598 100644
--- a/src/MEDS_tabular_automl/tabular_dataset.py
+++ b/src/MEDS_tabular_automl/tabular_dataset.py
@@ -170,12 +170,16 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]:
         codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes}
 
         if hasattr(self.cfg.tabularization, "max_by_correlation"):
-            corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1])
+            corrs = self._get_approximate_correlation_per_feature(
+                self.get_data_shards(0)[0], self.get_data_shards(0)[1]
+            )
             corrs = np.abs(corrs)
             sorted_corrs = np.argsort(corrs)[::-1]
             codes_set = set(sorted_corrs[: self.cfg.tabularization.max_by_correlation])
         if hasattr(self.cfg.tabularization, "min_correlation"):
-            corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1])
+            corrs = self._get_approximate_correlation_per_feature(
+                self.get_data_shards(0)[0], self.get_data_shards(0)[1]
+            )
             corrs = np.abs(corrs)
             codes_set = set(np.where(corrs > self.cfg.tabularization.min_correlation)[0])
 
@@ -184,7 +188,7 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]:
             self._get_code_masks(feature_columns, codes_set),
             len(feature_columns),
         )
-
+
     def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarray) -> np.ndarray:
         """Calculates the approximate correlation of each feature with the target.
 
@@ -202,14 +206,14 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr
        """
        # check that y has information
        if len(np.unique(y)) == 1:
            raise ValueError("Labels have no information. Cannot calculate correlation.")
-
+        from scipy.stats import pearsonr
+
        corrs = np.zeros(X.shape[1])
        for i in range(X.shape[1]):
            corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0]
        return corrs
-
 
    @TimeableMixin.TimeAs
    def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix:
        """Loads a specific data shard into memory as a sparse matrix.
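Note on the correlation filter above: _get_approximate_correlation_per_feature densifies one column per pearsonr call, which gets slow when a shard carries many thousands of features. The sketch below computes the same per-column Pearson correlations in a single vectorized pass over sparse column statistics; the helper name sparse_pearson_correlations is hypothetical and not part of this patch, and it assumes y is a dense 1-D label array, as in the loop above.

import numpy as np
import scipy.sparse as sp


def sparse_pearson_correlations(X: sp.csc_matrix, y: np.ndarray) -> np.ndarray:
    """Pearson correlation of every column of X with y, without densifying columns.

    Sketch only: intended to match looping scipy.stats.pearsonr over the columns.
    Constant columns yield NaN, mirroring pearsonr's undefined-correlation case.
    """
    n = X.shape[0]
    y = np.asarray(y, dtype=np.float64)

    mean_x = np.asarray(X.mean(axis=0)).ravel()               # E[x_j]
    mean_x2 = np.asarray(X.multiply(X).mean(axis=0)).ravel()  # E[x_j ** 2]
    mean_xy = np.asarray(X.T @ y).ravel() / n                 # E[x_j * y]

    std_x = np.sqrt(mean_x2 - mean_x**2)  # population std of each column
    std_y = y.std()                       # population std of the target

    with np.errstate(divide="ignore", invalid="ignore"):
        return (mean_xy - mean_x * y.mean()) / (std_x * std_y)

np.abs of this result would drop into the max_by_correlation and min_correlation branches unchanged.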
diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py
index f5e6251..badb246 100644
--- a/src/MEDS_tabular_automl/utils.py
+++ b/src/MEDS_tabular_automl/utils.py
@@ -78,17 +78,16 @@ def filter_to_codes(
     if min_code_inclusion_frequency is not None:
         pass
-        # need to consider size of the dataset vs count
-
+        # need to consider size of the dataset vs count
+        # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency)
-
+
     if min_code_inclusion_count is not None:
         feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)
 
     if max_include_codes is not None:
         feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes)
-
     return sorted(feature_freqs["code"].to_list())
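The count and top-N filters that this hunk touches in filter_to_codes compose as ordinary polars expressions. A minimal self-contained sketch follows; the frequency table, codes, and thresholds are invented for illustration, and recent polars spells the sort keyword descending, whereas the reverse keyword visible in the context line above comes from an older polars API.

import polars as pl

# Hypothetical per-code frequency table with the columns the function filters on.
feature_freqs = pl.DataFrame(
    {
        "code": ["lab//A", "lab//B", "dx//C", "dx//D"],
        "count": [500, 40, 7, 3],
    }
)

min_code_inclusion_count = 10
max_include_codes = 2

# Keep codes observed at least min_code_inclusion_count times.
feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)

# Of the remaining codes, keep only the max_include_codes most frequent.
feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes)

print(sorted(feature_freqs["code"].to_list()))  # ['lab//A', 'lab//B']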