From 3ae7f241761b9d3beaf7b94dbbfa1365aaef7dc6 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 22 Jul 2024 14:55:16 +0200 Subject: [PATCH 01/20] Git - added to gitignore folder for testing reproducibility --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 93216dc..7f090fe 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ pyrightconfig.json poetry.lock notebooks/profile_autofeat.py notebooks/newtons_law_of_cooling.ipynb +notebooks/autofeat_reproducibility/* From 3fb56dc3c7526759a8eb692a7671575f85247a87 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 22 Jul 2024 15:24:37 +0200 Subject: [PATCH 02/20] Mod - modified gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7f090fe..76ac585 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,4 @@ pyrightconfig.json poetry.lock notebooks/profile_autofeat.py notebooks/newtons_law_of_cooling.ipynb -notebooks/autofeat_reproducibility/* +src/autofeat/autofeat_reproducibility/* From 11d388dc9c980a0e55fc19d82a2403f47193314c Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 22 Jul 2024 17:49:00 +0200 Subject: [PATCH 03/20] Gitignore - added folder autofeat_reproducibility --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 76ac585..38bc016 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,4 @@ pyrightconfig.json poetry.lock notebooks/profile_autofeat.py notebooks/newtons_law_of_cooling.ipynb -src/autofeat/autofeat_reproducibility/* +notebooks/autofeat_reproducibility/* \ No newline at end of file From 24c2c200c6bb227bdb1e3f8d8300265b1360f310 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 22 Jul 2024 19:57:35 +0200 Subject: [PATCH 04/20] Add - Random seeds --- src/autofeat/featsel.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index e7f90e1..7dc2ac7 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -89,7 +89,7 @@ def _noise_filtering( return good_cols -def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0) -> list: +def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int = None) -> list: """ One feature selection run. @@ -105,6 +105,11 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st """ if df.shape[0] <= 1: raise ValueError(f"n_samples = {df.shape[0]}") + + # Set random seed + if random_seed is not None: + np.random.seed(random_seed) + # initial selection of too few but (hopefully) relevant features if problem_type == "regression": model = lm.LassoLarsCV(cv=5, eps=1e-8) @@ -146,7 +151,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st if problem_type == "regression": model = lm.LassoLarsCV(cv=5, eps=1e-8) else: - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced") + model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed) with warnings.catch_warnings(): warnings.simplefilter("ignore") # TODO: remove if sklearn least_angle issue is fixed @@ -184,6 +189,7 @@ def select_features( problem_type: str = "regression", n_jobs: int = 1, verbose: int = 0, + random_seed: int = None, ) -> list: """ Selects predictive features given the data and targets. 
@@ -201,6 +207,10 @@ def select_features( Returns: - good_cols: list of column names for df with which a regression model can be trained """ + # Set random seed + if random_seed is not None: + np.random.seed(random_seed) + if not (len(df) == len(target)): raise ValueError("[featsel] df and target dimension mismatch.") if keep is None: @@ -228,7 +238,7 @@ def run_select_features(i: int): logging.info(f"[featsel] Feature selection run {i + 1}/{featsel_runs}") np.random.seed(i) rand_idx = np.random.permutation(df_scaled.index)[: max(10, int(0.85 * len(df_scaled)))] - return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1) + return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=random_seed) if featsel_runs >= 1 and problem_type in ("regression", "classification"): if n_jobs == 1 or featsel_runs == 1: @@ -294,6 +304,7 @@ def __init__( keep: list | None = None, n_jobs: int = 1, verbose: int = 0, + random_seed: int = None ): """ multi-step cross-validated feature selection @@ -316,6 +327,7 @@ def __init__( self.keep = keep self.n_jobs = n_jobs self.verbose = verbose + self.random_seed = random_seed def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame): """ @@ -346,7 +358,9 @@ def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame): self.problem_type, self.n_jobs, self.verbose, + self.random_seed ) + print('Fit self.good_cols_', self.good_cols_) self.n_features_in_ = X.shape[1] return self From c1821d2671203a3919fe00f80b7c7ac1816efdf0 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Tue, 23 Jul 2024 21:36:06 +0200 Subject: [PATCH 05/20] Mod - change list to sorted (avoid randomness) --- src/autofeat/featsel.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 7dc2ac7..33a1225 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -133,15 +133,23 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st # weight threshold: select at most 0.2*n_train initial features thr = sorted(coefs, reverse=True)[min(df.shape[1] - 1, df.shape[0] // 5)] initial_cols = list(df.columns[coefs > thr]) + #print('initial_cols before noise:', initial_cols) ## Is ok, always the same + # noise filter initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type) good_cols_set = set(initial_cols) if verbose > 0: logging.info(f"[featsel]\t {len(initial_cols)} initial features.") + + #print('initial_cols after noise:', initial_cols) ## Is ok, always the same # add noise features X_w_noise = _add_noise_features(df[initial_cols].to_numpy()) + + #print('X_w_noise:', X_w_noise) - it is always the same # go through all remaining features in splits of n_feat <= 0.5*n_train - other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) + np.random.seed(42) + #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) + other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols)))) if other_cols: n_splits = int(np.ceil(len(other_cols) / max(10, 0.5 * df.shape[0] - len(initial_cols)))) split_size = int(np.ceil(len(other_cols) / n_splits)) @@ -175,6 +183,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st ) # noise filtering on the combination of features good_cols = list(good_cols_set) + 
print('good_cols:', good_cols)
     good_cols = _noise_filtering(df[good_cols].to_numpy(), target, good_cols, problem_type)
     if verbose > 0:
         logging.info(f"\n[featsel]\t Selected {len(good_cols):3} features after noise filtering.")

From ec9457a511d45f28288bc69b72522ef4f1d05c02 Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Tue, 23 Jul 2024 21:59:12 +0200
Subject: [PATCH 06/20] Mod - fix the Parallel function

---
 src/autofeat/featsel.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index 33a1225..3025dd3 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -133,7 +133,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st
     # weight threshold: select at most 0.2*n_train initial features
     thr = sorted(coefs, reverse=True)[min(df.shape[1] - 1, df.shape[0] // 5)]
     initial_cols = list(df.columns[coefs > thr])
-    #print('initial_cols before noise:', initial_cols) ## Is ok, always the same
+    print('initial_cols before noise:', initial_cols) ## Is ok, always the same

     # noise filter
     initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type)
@@ -141,11 +141,11 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st
     if verbose > 0:
         logging.info(f"[featsel]\t {len(initial_cols)} initial features.")

-    #print('initial_cols after noise:', initial_cols) ## Is ok, always the same
+    print('initial_cols after noise:', initial_cols) ## Is ok, always the same
     # add noise features
     X_w_noise = _add_noise_features(df[initial_cols].to_numpy())

-    #print('X_w_noise:', X_w_noise) - it is always the same
+    print('X_w_noise:', X_w_noise[:5, :5])
     # go through all remaining features in splits of n_feat <= 0.5*n_train
     np.random.seed(42)
     #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols))))
     other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols))))

From 3cae5d2b64e1832f0b5b431ee55ac54490f33b63 Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Thu, 25 Jul 2024 22:08:05 +0200
Subject: [PATCH 07/20] Mod - fix reproducibility when sorting columns

---
 src/autofeat/featsel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index 3025dd3..ff1c050 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -272,10 +272,10 @@ def flatten_lists(l: list):

     if selected_columns:
         selected_columns_counter = Counter(selected_columns)
-        # sort by frequency, but down weight longer formulas to break ties
+        # sort by frequency, but down weight longer formulas to break ties. 
Also added some randomness to fix reproducibility when equal freq and length
         selected_columns = sorted(
             selected_columns_counter,
-            key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)),
+            key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)) + np.random.random() * 0.000001,
             reverse=True,
         )

From 5727461677801610bfa6a6afb31af02f76db9ad7 Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Thu, 25 Jul 2024 22:22:28 +0200
Subject: [PATCH 08/20] Mod - Random seed added to definition of run_select_features

---
 src/autofeat/featsel.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index ff1c050..3a62c8f 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -242,12 +242,12 @@ def select_features(

     # select good features in k runs in parallel
     # by doing sort of a cross-validation (i.e., randomly subsample data points)
-    def run_select_features(i: int):
+    def run_select_features(i: int, seed:int):
         if verbose > 0:
             logging.info(f"[featsel] Feature selection run {i + 1}/{featsel_runs}")
-        np.random.seed(i)
+        np.random.seed(seed)
         rand_idx = np.random.permutation(df_scaled.index)[: max(10, int(0.85 * len(df_scaled)))]
-        return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=random_seed)
+        return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=seed)

     if featsel_runs >= 1 and problem_type in ("regression", "classification"):
         if n_jobs == 1 or featsel_runs == 1:
@@ -256,13 +256,12 @@ def run_select_features(i: int):
             # only use parallelization code if you actually parallelize
             selected_columns = []
             for i in range(featsel_runs):
                 selected_columns.extend(run_select_features(i))
         else:
-            np.random.seed(i)
-            def flatten_lists(l: list):
-                return [item for sublist in l for item in sublist]
+            # Generate a list of seeds, one for each run
+            seeds = np.random.randint(0, 100000, size=featsel_runs)

+            def flatten_lists(l: list):
+                return [item for sublist in l for item in sublist]
+
             selected_columns = flatten_lists(
                 Parallel(n_jobs=n_jobs, verbose=100 * verbose)(
                     delayed(run_select_features)(i, seeds[i]) for i in range(featsel_runs)))

From bbbfa7ee3a33564e350e7d34bbbb81e29e512f8b Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Thu, 25 Jul 2024 22:30:17 +0200
Subject: [PATCH 09/20] Mod - make another seed consistent

---
 src/autofeat/featsel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index 3a62c8f..cc9f1a8 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -147,7 +147,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st

     # go through all remaining features in splits of n_feat <= 0.5*n_train
-    np.random.seed(42)
+    np.random.seed(random_seed)
     #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols))))
     other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols))))

From dcdfec0a5350c8a4600f2b7c6f50447d140b993b Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Thu, 25 Jul 2024 22:54:24 +0200
Subject: [PATCH 10/20] Add - added random seed to _noise_filtering

---
 src/autofeat/featsel.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index cc9f1a8..9067801 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -43,6 +43,7 @@ def 
_noise_filtering( target: np.ndarray, good_cols: list | None = None, problem_type: str = "regression", + random_seed: int = None ) -> list: """ Trains a prediction model with additional noise features and selects only those of the @@ -65,11 +66,12 @@ def _noise_filtering( if problem_type == "regression": model = lm.LassoLarsCV(cv=5, eps=1e-8) elif problem_type == "classification": - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced") + model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced",random_state=random_seed) else: logging.warning(f"[featsel] Unknown problem_type {problem_type} - not performing noise filtering.") model = None if model is not None: + np.random.seed(random_seed) # Set seed for noise feature addition and permutation X = _add_noise_features(X) with warnings.catch_warnings(): warnings.simplefilter("ignore") From 2a9ea6064aebcdd8715e7f0fb1ef4e042271eb09 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Thu, 25 Jul 2024 22:58:40 +0200 Subject: [PATCH 11/20] Clean - remove extra print statements --- src/autofeat/featsel.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 9067801..7ef3586 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -135,19 +135,16 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st # weight threshold: select at most 0.2*n_train initial features thr = sorted(coefs, reverse=True)[min(df.shape[1] - 1, df.shape[0] // 5)] initial_cols = list(df.columns[coefs > thr]) - print('initial_cols before noise:', initial_cols) ## Is ok, always the same # noise filter - initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type) + initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, random_seed=random_seed) good_cols_set = set(initial_cols) if verbose > 0: logging.info(f"[featsel]\t {len(initial_cols)} initial features.") - print('initial_cols after noise:', initial_cols) ## Is ok, always the same # add noise features X_w_noise = _add_noise_features(df[initial_cols].to_numpy()) - print('X_w_noise:', X_w_noise[:5, :5]) # go through all remaining features in splits of n_feat <= 0.5*n_train np.random.seed(random_seed) #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) @@ -175,9 +172,11 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st # for classification, model.coefs_ is n_classes x n_features, but we need n_features coefs = np.abs(model.coef_) if problem_type == "regression" else np.max(np.abs(model.coef_), axis=0) weights = dict(zip(current_cols, coefs[: len(current_cols)])) + # only include features that are more important than our known noise features noise_w_thr = np.max(coefs[len(current_cols) :]) good_cols_set.update([c for c in weights if abs(weights[c]) > noise_w_thr]) + if verbose > 0: print( f"[featsel]\t Split {i + 1:2}/{n_splits}: {len(good_cols_set):3} candidate features identified.", @@ -185,7 +184,6 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st ) # noise filtering on the combination of features good_cols = list(good_cols_set) - print('good_cols:', good_cols) good_cols = _noise_filtering(df[good_cols].to_numpy(), target, good_cols, problem_type) if verbose > 0: logging.info(f"\n[featsel]\t Selected {len(good_cols):3} features after noise filtering.") @@ -256,7 +254,8 @@ 
def run_select_features(i: int, seed:int):
             # only use parallelization code if you actually parallelize
             selected_columns = []
             for i in range(featsel_runs):
-                selected_columns.extend(run_select_features(i))
+                selected_columns.extend(run_select_features(i, random_seed))
+
         else:
             # Generate a list of seeds, one for each run
             seeds = np.random.randint(0, 100000, size=featsel_runs)
@@ -268,9 +267,6 @@ def flatten_lists(l: list):
             selected_columns = flatten_lists(
                 Parallel(n_jobs=n_jobs, verbose=100 * verbose)(
                     delayed(run_select_features)(i, seeds[i]) for i in range(featsel_runs)))
-            print('featsel_runs:', featsel_runs)
-            print('selected_columns:', selected_columns)
-
     if selected_columns:
         selected_columns_counter = Counter(selected_columns)
         # sort by frequency, but down weight longer formulas to break ties. Also added some randomness to fix reproducibility when equal freq and length
@@ -279,6 +275,7 @@ def flatten_lists(l: list):
             key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)) + np.random.random() * 0.000001,
             reverse=True,
         )
+
         if verbose > 0:
             logging.info(f"[featsel] {len(selected_columns)} features after {featsel_runs} feature selection runs")
     # correlation filtering
@@ -376,7 +373,6 @@ def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame):
             self.verbose,
             self.random_seed
         )
-        print('Fit self.good_cols_', self.good_cols_)
         self.n_features_in_ = X.shape[1]
         return self

From 306eacfb0d800059d7559dd9ba891fe303150783 Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Mon, 5 Aug 2024 16:19:23 +0300
Subject: [PATCH 12/20] Format - run RUFF formatting on featsel

---
 src/autofeat/featsel.py | 45 +++++++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index 7ef3586..fd97f05 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -39,11 +39,7 @@ def _add_noise_features(X: np.ndarray):
 
 
 def _noise_filtering(
-    X: np.ndarray,
-    target: np.ndarray,
-    good_cols: list | None = None,
-    problem_type: str = "regression",
-    random_seed: int = None
+    X: np.ndarray, target: np.ndarray, good_cols: list | None = None, problem_type: str = "regression", random_seed: int = None
 ) -> list:
     """
     Trains a prediction model with additional noise features and selects only those of the
@@ -66,7 +62,7 @@ def _noise_filtering(
     if problem_type == "regression":
         model = lm.LassoLarsCV(cv=5, eps=1e-8)
     elif problem_type == "classification":
-        model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced",random_state=random_seed)
+        model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed)
     else:
         logging.warning(f"[featsel] Unknown problem_type {problem_type} - not performing noise filtering.")
         model = None
@@ -91,7 +87,9 @@ def _noise_filtering(
     return good_cols
 
 
-def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int = None) -> list:
+def _select_features_1run(
+    df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int = None
+) -> list:
     """
     One feature selection run.
 
@@ -107,7 +105,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st """ if df.shape[0] <= 1: raise ValueError(f"n_samples = {df.shape[0]}") - + # Set random seed if random_seed is not None: np.random.seed(random_seed) @@ -147,7 +145,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st # go through all remaining features in splits of n_feat <= 0.5*n_train np.random.seed(random_seed) - #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) + # other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols)))) if other_cols: n_splits = int(np.ceil(len(other_cols) / max(10, 0.5 * df.shape[0] - len(initial_cols)))) @@ -158,7 +156,9 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st if problem_type == "regression": model = lm.LassoLarsCV(cv=5, eps=1e-8) else: - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed) + model = lm.LogisticRegressionCV( + cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed + ) with warnings.catch_warnings(): warnings.simplefilter("ignore") # TODO: remove if sklearn least_angle issue is fixed @@ -219,7 +219,7 @@ def select_features( # Set random seed if random_seed is not None: np.random.seed(random_seed) - + if not (len(df) == len(target)): raise ValueError("[featsel] df and target dimension mismatch.") if keep is None: @@ -242,12 +242,14 @@ def select_features( # select good features in k runs in parallel # by doing sort of a cross-validation (i.e., randomly subsample data points) - def run_select_features(i: int, seed:int): + def run_select_features(i: int, seed: int): if verbose > 0: logging.info(f"[featsel] Feature selection run {i + 1}/{featsel_runs}") np.random.seed(seed) rand_idx = np.random.permutation(df_scaled.index)[: max(10, int(0.85 * len(df_scaled)))] - return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=seed) + return _select_features_1run( + df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=seed + ) if featsel_runs >= 1 and problem_type in ("regression", "classification"): if n_jobs == 1 or featsel_runs == 1: @@ -265,8 +267,10 @@ def flatten_lists(l: list): selected_columns = flatten_lists( Parallel(n_jobs=n_jobs, verbose=100 * verbose)( - delayed(run_select_features)(i, seeds[i]) for i in range(featsel_runs))) - + delayed(run_select_features)(i, seeds[i]) for i in range(featsel_runs) + ) + ) + if selected_columns: selected_columns_counter = Counter(selected_columns) # sort by frequency, but down weight longer formulas to break ties. 
Also added some randomness to fix reproducibility when equal freq and length @@ -317,7 +321,7 @@ def __init__( keep: list | None = None, n_jobs: int = 1, verbose: int = 0, - random_seed: int = None + random_seed: int = None, ): """ multi-step cross-validated feature selection @@ -364,14 +368,7 @@ def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame): df = pd.DataFrame(X, columns=cols) # do the feature selection self.good_cols_ = select_features( - df, - target, - self.featsel_runs, - self.keep, - self.problem_type, - self.n_jobs, - self.verbose, - self.random_seed + df, target, self.featsel_runs, self.keep, self.problem_type, self.n_jobs, self.verbose, self.random_seed ) self.n_features_in_ = X.shape[1] return self From 77336c56d4f82482aaa7f03d2f585ed6a757cd04 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 16:28:33 +0300 Subject: [PATCH 13/20] Mod - added separate cross validation before fitting models --- src/autofeat/featsel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index fd97f05..7252277 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -12,6 +12,7 @@ import sklearn.linear_model as lm from joblib import Parallel, delayed from sklearn.base import BaseEstimator +from sklearn.model_selection import KFold from sklearn.utils.validation import check_array, check_is_fitted, check_X_y from autofeat.nb_utils import nb_standard_scale @@ -59,10 +60,11 @@ def _noise_filtering( good_cols = list(range(n_feat)) assert len(good_cols) == n_feat, "fewer column names provided than features in X." # perform noise filtering on these features + kf = KFold(n_splits=5, shuffle=True, random_state=random_seed) if problem_type == "regression": - model = lm.LassoLarsCV(cv=5, eps=1e-8) + model = lm.LassoLarsCV(cv=kf, eps=1e-8) elif problem_type == "classification": - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed) + model = lm.LogisticRegressionCV(cv=kf, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed) else: logging.warning(f"[featsel] Unknown problem_type {problem_type} - not performing noise filtering.") model = None From 1e8e69f107771adf93d11fce65f4bc9cede2f434 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 16:32:03 +0300 Subject: [PATCH 14/20] Rem - removed extra random seed --- src/autofeat/featsel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 7252277..43ca06e 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -146,7 +146,6 @@ def _select_features_1run( X_w_noise = _add_noise_features(df[initial_cols].to_numpy()) # go through all remaining features in splits of n_feat <= 0.5*n_train - np.random.seed(random_seed) # other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols)))) if other_cols: From b2f6c7aeec0e1d7f06984bf95eb0b5ae55ef3841 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 16:46:10 +0300 Subject: [PATCH 15/20] Mod - solve the seed within 1run of select features --- src/autofeat/featsel.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 43ca06e..ba8ee1f 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -243,10 +243,14 @@ def select_features( # 
select good features in k runs in parallel # by doing sort of a cross-validation (i.e., randomly subsample data points) - def run_select_features(i: int, seed: int): + def run_select_features(i: int, random_seed: int): if verbose > 0: logging.info(f"[featsel] Feature selection run {i + 1}/{featsel_runs}") - np.random.seed(seed) + np.random.seed(random_seed) + loop_seed = np.random.randint( + 10**6 + ) # Added to random_seed to make sure that the 1run seed is different for each run, but globally reproducible + seed = random_seed + loop_seed if random_seed is not None else loop_seed rand_idx = np.random.permutation(df_scaled.index)[: max(10, int(0.85 * len(df_scaled)))] return _select_features_1run( df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=seed From ea1f74294418b6c8c81355e47773f6348c6914de Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 16:52:09 +0300 Subject: [PATCH 16/20] Mod - solved the random seed generator --- src/autofeat/featsel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index ba8ee1f..3c62e32 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -265,7 +265,11 @@ def run_select_features(i: int, random_seed: int): else: # Generate a list of seeds, one for each run - seeds = np.random.randint(0, 100000, size=featsel_runs) + def random_seed_generator(low=0, high=2**32 - 1): + while True: + seed = np.random.randint(low, high) + yield seed + seeds = random_seed_generator() def flatten_lists(l: list): return [item for sublist in l for item in sublist] From 73b83816f063c2bd21bf91d627e3b9c707ab8e2d Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 17:00:27 +0300 Subject: [PATCH 17/20] Typing - fixed typing hint of random_seed --- src/autofeat/featsel.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 3c62e32..bd27e10 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -40,7 +40,11 @@ def _add_noise_features(X: np.ndarray): def _noise_filtering( - X: np.ndarray, target: np.ndarray, good_cols: list | None = None, problem_type: str = "regression", random_seed: int = None + X: np.ndarray, + target: np.ndarray, + good_cols: list | None = None, + problem_type: str = "regression", + random_seed: int | None = None, ) -> list: """ Trains a prediction model with additional noise features and selects only those of the @@ -90,7 +94,7 @@ def _noise_filtering( def _select_features_1run( - df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int = None + df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int | None = None ) -> list: """ One feature selection run. @@ -199,7 +203,7 @@ def select_features( problem_type: str = "regression", n_jobs: int = 1, verbose: int = 0, - random_seed: int = None, + random_seed: int | None = None, ) -> list: """ Selects predictive features given the data and targets. 
@@ -269,6 +273,7 @@ def random_seed_generator(low=0, high=2**32 - 1): while True: seed = np.random.randint(low, high) yield seed + seeds = random_seed_generator() def flatten_lists(l: list): @@ -330,7 +335,7 @@ def __init__( keep: list | None = None, n_jobs: int = 1, verbose: int = 0, - random_seed: int = None, + random_seed: int | None = None, ): """ multi-step cross-validated feature selection From 0a026901476056cf90691f800b8535e92f509780 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 17:04:37 +0300 Subject: [PATCH 18/20] Mod - removed extra randomness in selecting columns --- src/autofeat/featsel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index bd27e10..3cd2315 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -141,7 +141,7 @@ def _select_features_1run( initial_cols = list(df.columns[coefs > thr]) # noise filter - initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, random_seed=random_seed) + initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type, random_seed=random_seed) good_cols_set = set(initial_cols) if verbose > 0: logging.info(f"[featsel]\t {len(initial_cols)} initial features.") @@ -290,7 +290,7 @@ def flatten_lists(l: list): # sort by frequency, but down weight longer formulas to break ties. Also added some randomness to fix reproducibility when equal freq and length selected_columns = sorted( selected_columns_counter, - key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)) + np.random.random() * 0.000001, + key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)), reverse=True, ) From eff942847f584f5ac23b5b95bbbd7bb0e43314a1 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Tue, 6 Aug 2024 18:47:05 +0300 Subject: [PATCH 19/20] Mod - using KFold with all CV models; move random_seed_generator to utils.py --- src/autofeat/featsel.py | 18 ++++++++---------- src/autofeat/utils.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 10 deletions(-) create mode 100644 src/autofeat/utils.py diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 3cd2315..edb412a 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -16,6 +16,7 @@ from sklearn.utils.validation import check_array, check_is_fitted, check_X_y from autofeat.nb_utils import nb_standard_scale +from autofeat.utils import random_seed_generator logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) @@ -117,10 +118,11 @@ def _select_features_1run( np.random.seed(random_seed) # initial selection of too few but (hopefully) relevant features + kf = KFold(n_splits=5, shuffle=True, random_state=random_seed) if problem_type == "regression": - model = lm.LassoLarsCV(cv=5, eps=1e-8) + model = lm.LassoLarsCV(cv=kf, eps=1e-8) elif problem_type == "classification": - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced") + model = lm.LogisticRegressionCV(cv=kf, penalty="l1", solver="saga", class_weight="balanced") else: logging.warning(f"[featsel] Unknown problem_type {problem_type} - not performing feature selection!") return [] @@ -158,11 +160,12 @@ def _select_features_1run( for i in range(n_splits): current_cols = other_cols[i * split_size : min(len(other_cols), (i + 1) * split_size)] X = np.hstack([df[current_cols].to_numpy(), X_w_noise]) + kf = KFold(n_splits=5, shuffle=True, random_state=random_seed) if problem_type == 
"regression": - model = lm.LassoLarsCV(cv=5, eps=1e-8) + model = lm.LassoLarsCV(cv=kf, eps=1e-8) else: model = lm.LogisticRegressionCV( - cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed + cv=kf, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed ) with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -269,12 +272,7 @@ def run_select_features(i: int, random_seed: int): else: # Generate a list of seeds, one for each run - def random_seed_generator(low=0, high=2**32 - 1): - while True: - seed = np.random.randint(low, high) - yield seed - - seeds = random_seed_generator() + seeds = random_seed_generator(num_seeds=featsel_runs) def flatten_lists(l: list): return [item for sublist in l for item in sublist] diff --git a/src/autofeat/utils.py b/src/autofeat/utils.py new file mode 100644 index 0000000..b129660 --- /dev/null +++ b/src/autofeat/utils.py @@ -0,0 +1,17 @@ +import numpy as np + + +def random_seed_generator(num_seeds: int, low: int = 0, high: int = 2**32 - 1): + """ + Generate a specified number of random integer seeds. + + Parameters: + num_seeds: Number of random seeds to generate. + low: Lower bound for random integers (default is 0). + high: Upper bound for random integers (default is 2**32 - 1). + + Returns: + list: List of random seeds. + """ + # Generate and return a list of random seeds + return [np.random.randint(low, high) for _ in range(num_seeds)] From de21a01151817606af31aa3b82fc87185c7eb9e6 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Thu, 8 Aug 2024 17:24:50 +0300 Subject: [PATCH 20/20] Mod - replaced custom function by np.random.default_rng(); fixed the comment --- src/autofeat/featsel.py | 6 +++--- src/autofeat/utils.py | 17 ----------------- 2 files changed, 3 insertions(+), 20 deletions(-) delete mode 100644 src/autofeat/utils.py diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index edb412a..50b4ccd 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -16,7 +16,6 @@ from sklearn.utils.validation import check_array, check_is_fitted, check_X_y from autofeat.nb_utils import nb_standard_scale -from autofeat.utils import random_seed_generator logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) @@ -272,7 +271,8 @@ def run_select_features(i: int, random_seed: int): else: # Generate a list of seeds, one for each run - seeds = random_seed_generator(num_seeds=featsel_runs) + generator = np.random.default_rng(seed=random_seed) + seeds = generator.integers(low=0, high=10**6, size=featsel_runs) def flatten_lists(l: list): return [item for sublist in l for item in sublist] @@ -285,7 +285,7 @@ def flatten_lists(l: list): if selected_columns: selected_columns_counter = Counter(selected_columns) - # sort by frequency, but down weight longer formulas to break ties. Also added some randomness to fix reproducibility when equal freq and length + # sort by frequency, but down weight longer formulas to break ties selected_columns = sorted( selected_columns_counter, key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)), diff --git a/src/autofeat/utils.py b/src/autofeat/utils.py deleted file mode 100644 index b129660..0000000 --- a/src/autofeat/utils.py +++ /dev/null @@ -1,17 +0,0 @@ -import numpy as np - - -def random_seed_generator(num_seeds: int, low: int = 0, high: int = 2**32 - 1): - """ - Generate a specified number of random integer seeds. - - Parameters: - num_seeds: Number of random seeds to generate. 
- low: Lower bound for random integers (default is 0). - high: Upper bound for random integers (default is 2**32 - 1). - - Returns: - list: List of random seeds. - """ - # Generate and return a list of random seeds - return [np.random.randint(low, high) for _ in range(num_seeds)]
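

Taken together, the series threads a random_seed argument through select_features() and FeatureSelector, seeds the data subsampling and the per-run worker seeds (drawn from np.random.default_rng in PATCH 20), and pins the cross-validation splits with a seeded KFold (PATCHES 13 and 19). A minimal end-to-end smoke test for the behavior the series is after might look like the sketch below; the synthetic data, the column names, and the exact constructor keywords are illustrative assumptions, not taken from the patches themselves:

import numpy as np
import pandas as pd

from autofeat.featsel import FeatureSelector

# Toy regression problem: the target depends on two of the ten columns.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 10)), columns=[f"x{i}" for i in range(10)])
y = (3 * X["x0"] - 2 * X["x3"] + rng.normal(scale=0.1, size=200)).to_numpy()

# With random_seed fixed, two independent fits on the same data should
# select exactly the same feature columns.
fs_a = FeatureSelector(problem_type="regression", random_seed=42).fit(X, y)
fs_b = FeatureSelector(problem_type="regression", random_seed=42).fit(X, y)
assert fs_a.good_cols_ == fs_b.good_cols_

The same check should also pass with n_jobs > 1: after PATCH 20 the per-run seeds come from np.random.default_rng(seed=random_seed) rather than from unseeded global state, so the parallel path no longer depends on process-level RNG state.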