Skip to content

Commit

Permalink
[wip] filtering features
Browse files — browse the repository at this point in the history
  • Loading branch information
teyaberg committed Aug 20, 2024
1 parent 0612730 commit 2feee79
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/MEDS_tabular_automl/sklearn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def evaluate(self, split: str = "tuning") -> float:
y_true.extend(labels)
y_pred = np.array(y_pred)
y_true = np.array(y_true)

# check if y_pred and y_true are not empty
if len(y_pred) == 0 or len(y_true) == 0:
raise ValueError("Predictions or true labels are empty.")
Expand Down
14 changes: 9 additions & 5 deletions src/MEDS_tabular_automl/tabular_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,16 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]:
codes_set = {feature_dict[code] for code in feature_dict if code in allowed_codes}

if hasattr(self.cfg.tabularization, "max_by_correlation"):
corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1])
corrs = self._get_approximate_correlation_per_feature(
self.get_data_shards(0)[0], self.get_data_shards(0)[1]
)
corrs = np.abs(corrs)
sorted_corrs = np.argsort(corrs)[::-1]
codes_set = set(sorted_corrs[: self.cfg.tabularization.max_by_correlation])
if hasattr(self.cfg.tabularization, "min_correlation"):
corrs = self._get_approximate_correlation_per_feature(self.get_data_shards(0)[0], self.get_data_shards(0)[1])
corrs = self._get_approximate_correlation_per_feature(
self.get_data_shards(0)[0], self.get_data_shards(0)[1]
)
corrs = np.abs(corrs)
codes_set = set(np.where(corrs > self.cfg.tabularization.min_correlation)[0])

Expand All @@ -184,7 +188,7 @@ def _get_code_set(self) -> tuple[set[int], Mapping[str, list[bool]], int]:
self._get_code_masks(feature_columns, codes_set),
len(feature_columns),
)

def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarray) -> np.ndarray:
"""Calculates the approximate correlation of each feature with the target.
Expand All @@ -202,14 +206,14 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr
# check that y has information
if len(np.unique(y)) == 1:
raise ValueError("Labels have no information. Cannot calculate correlation.")

from scipy.stats import pearsonr

corrs = np.zeros(X.shape[1])
for i in range(X.shape[1]):
corrs[i] = pearsonr(X[:, i].toarray().flatten(), y)[0]
return corrs


@TimeableMixin.TimeAs
def _load_dynamic_shard_from_file(self, path: Path, idx: int) -> sp.csc_matrix:
"""Loads a specific data shard into memory as a sparse matrix.
Expand Down
7 changes: 3 additions & 4 deletions src/MEDS_tabular_automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,16 @@ def filter_to_codes(

if min_code_inclusion_frequency is not None:
pass
# need to consider size of the dataset vs count
# need to consider size of the dataset vs count

# feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency)

if min_code_inclusion_count is not None:
feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count)

if max_include_codes is not None:
feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes)


return sorted(feature_freqs["code"].to_list())


Expand Down

0 comments on commit 2feee79

Please sign in to comment.