From 3ae7f241761b9d3beaf7b94dbbfa1365aaef7dc6 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 22 Jul 2024 14:55:16 +0200 Subject: [PATCH 01/20] Git - added to gitignore folder for testing reproducibility --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 93216dc..7f090fe 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ pyrightconfig.json poetry.lock notebooks/profile_autofeat.py notebooks/newtons_law_of_cooling.ipynb +notebooks/autofeat_reproducibility/* From 3fb56dc3c7526759a8eb692a7671575f85247a87 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 22 Jul 2024 15:24:37 +0200 Subject: [PATCH 02/20] Mod - modified gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7f090fe..76ac585 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,4 @@ pyrightconfig.json poetry.lock notebooks/profile_autofeat.py notebooks/newtons_law_of_cooling.ipynb -notebooks/autofeat_reproducibility/* +src/autofeat/autofeat_reproducibility/* From 11d388dc9c980a0e55fc19d82a2403f47193314c Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 22 Jul 2024 17:49:00 +0200 Subject: [PATCH 03/20] Gitignore - added folder autofeat_reproducibility --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 76ac585..38bc016 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,4 @@ pyrightconfig.json poetry.lock notebooks/profile_autofeat.py notebooks/newtons_law_of_cooling.ipynb -src/autofeat/autofeat_reproducibility/* +notebooks/autofeat_reproducibility/* \ No newline at end of file From 24c2c200c6bb227bdb1e3f8d8300265b1360f310 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 22 Jul 2024 19:57:35 +0200 Subject: [PATCH 04/20] Add - Random seeds --- src/autofeat/featsel.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index e7f90e1..7dc2ac7 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -89,7 +89,7 @@ def _noise_filtering( return good_cols -def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0) -> list: +def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int = None) -> list: """ One feature selection run. @@ -105,6 +105,11 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st """ if df.shape[0] <= 1: raise ValueError(f"n_samples = {df.shape[0]}") + + # Set random seed + if random_seed is not None: + np.random.seed(random_seed) + # initial selection of too few but (hopefully) relevant features if problem_type == "regression": model = lm.LassoLarsCV(cv=5, eps=1e-8) @@ -146,7 +151,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st if problem_type == "regression": model = lm.LassoLarsCV(cv=5, eps=1e-8) else: - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced") + model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed) with warnings.catch_warnings(): warnings.simplefilter("ignore") # TODO: remove if sklearn least_angle issue is fixed @@ -184,6 +189,7 @@ def select_features( problem_type: str = "regression", n_jobs: int = 1, verbose: int = 0, + random_seed: int = None, ) -> list: """ Selects predictive features given the data and targets. 
@@ -201,6 +207,10 @@ def select_features( Returns: - good_cols: list of column names for df with which a regression model can be trained """ + # Set random seed + if random_seed is not None: + np.random.seed(random_seed) + if not (len(df) == len(target)): raise ValueError("[featsel] df and target dimension mismatch.") if keep is None: @@ -228,7 +238,7 @@ def run_select_features(i: int): logging.info(f"[featsel] Feature selection run {i + 1}/{featsel_runs}") np.random.seed(i) rand_idx = np.random.permutation(df_scaled.index)[: max(10, int(0.85 * len(df_scaled)))] - return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1) + return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=random_seed) if featsel_runs >= 1 and problem_type in ("regression", "classification"): if n_jobs == 1 or featsel_runs == 1: @@ -294,6 +304,7 @@ def __init__( keep: list | None = None, n_jobs: int = 1, verbose: int = 0, + random_seed: int = None ): """ multi-step cross-validated feature selection @@ -316,6 +327,7 @@ def __init__( self.keep = keep self.n_jobs = n_jobs self.verbose = verbose + self.random_seed = random_seed def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame): """ @@ -346,7 +358,9 @@ def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame): self.problem_type, self.n_jobs, self.verbose, + self.random_seed ) + print('Fit self.good_cols_', self.good_cols_) self.n_features_in_ = X.shape[1] return self From c1821d2671203a3919fe00f80b7c7ac1816efdf0 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Tue, 23 Jul 2024 21:36:06 +0200 Subject: [PATCH 05/20] Mod - change list to sorted (avoid randomness) --- src/autofeat/featsel.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 7dc2ac7..33a1225 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -133,15 +133,23 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st # weight threshold: select at most 0.2*n_train initial features thr = sorted(coefs, reverse=True)[min(df.shape[1] - 1, df.shape[0] // 5)] initial_cols = list(df.columns[coefs > thr]) + #print('initial_cols before noise:', initial_cols) ## Is ok, always the same + # noise filter initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type) good_cols_set = set(initial_cols) if verbose > 0: logging.info(f"[featsel]\t {len(initial_cols)} initial features.") + + #print('initial_cols after noise:', initial_cols) ## Is ok, always the same # add noise features X_w_noise = _add_noise_features(df[initial_cols].to_numpy()) + + #print('X_w_noise:', X_w_noise) - it is always the same # go through all remaining features in splits of n_feat <= 0.5*n_train - other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) + np.random.seed(42) + #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) + other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols)))) if other_cols: n_splits = int(np.ceil(len(other_cols) / max(10, 0.5 * df.shape[0] - len(initial_cols)))) split_size = int(np.ceil(len(other_cols) / n_splits)) @@ -175,6 +183,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st ) # noise filtering on the combination of features good_cols = list(good_cols_set) + 
print('good_cols:', good_cols)
     good_cols = _noise_filtering(df[good_cols].to_numpy(), target, good_cols, problem_type)
     if verbose > 0:
         logging.info(f"\n[featsel]\t Selected {len(good_cols):3} features after noise filtering.")

From ec9457a511d45f28288bc69b72522ef4f1d05c02 Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Tue, 23 Jul 2024 21:59:12 +0200
Subject: [PATCH 06/20] Mod - fix the Parallel function

---
 src/autofeat/featsel.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index 33a1225..3025dd3 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -133,7 +133,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st
     # weight threshold: select at most 0.2*n_train initial features
     thr = sorted(coefs, reverse=True)[min(df.shape[1] - 1, df.shape[0] // 5)]
     initial_cols = list(df.columns[coefs > thr])
-    #print('initial_cols before noise:', initial_cols) ## Is ok, always the same
+    print('initial_cols before noise:', initial_cols) ## Is ok, always the same

     # noise filter
     initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type)
@@ -141,11 +141,11 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st
     if verbose > 0:
         logging.info(f"[featsel]\t {len(initial_cols)} initial features.")

-    #print('initial_cols after noise:', initial_cols) ## Is ok, always the same
+    print('initial_cols after noise:', initial_cols) ## Is ok, always the same
     # add noise features
     X_w_noise = _add_noise_features(df[initial_cols].to_numpy())

-    #print('X_w_noise:', X_w_noise) - it is always the same
+    print('X_w_noise:', X_w_noise[:5, :5])
     # go through all remaining features in splits of n_feat <= 0.5*n_train
     np.random.seed(42)
     #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols))))
     other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols))))

From 3cae5d2b64e1832f0b5b431ee55ac54490f33b63 Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Thu, 25 Jul 2024 22:08:05 +0200
Subject: [PATCH 07/20] Mod - fix reproducibility when sorting columns

---
 src/autofeat/featsel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index 3025dd3..ff1c050 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -272,10 +272,10 @@ def flatten_lists(l: list):

     if selected_columns:
         selected_columns_counter = Counter(selected_columns)
-        # sort by frequency, but down weight longer formulas to break ties
+        # sort by frequency, but down weight longer formulas to break ties. 
Also added some randomness to fix reproducibility when equal freq and length
         selected_columns = sorted(
             selected_columns_counter,
-            key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)),
+            key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)) + np.random.random() * 0.000001,
             reverse=True,
         )

From 5727461677801610bfa6a6afb31af02f76db9ad7 Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Thu, 25 Jul 2024 22:22:28 +0200
Subject: [PATCH 08/20] Mod - Random seed added to definition of run_select_features

---
 src/autofeat/featsel.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index ff1c050..3a62c8f 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -242,12 +242,12 @@ def select_features(

     # select good features in k runs in parallel
     # by doing sort of a cross-validation (i.e., randomly subsample data points)
-    def run_select_features(i: int):
+    def run_select_features(i: int, seed:int):
         if verbose > 0:
             logging.info(f"[featsel] Feature selection run {i + 1}/{featsel_runs}")
-        np.random.seed(i)
+        np.random.seed(seed)
         rand_idx = np.random.permutation(df_scaled.index)[: max(10, int(0.85 * len(df_scaled)))]
-        return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=random_seed)
+        return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=seed)

     if featsel_runs >= 1 and problem_type in ("regression", "classification"):
         if n_jobs == 1 or featsel_runs == 1:
@@ -256,13 +256,12 @@ def run_select_features(i: int):
             # only use parallelization code if you actually parallelize
             selected_columns = []
             for i in range(featsel_runs):
                 selected_columns.extend(run_select_features(i))
         else:
-            np.random.seed(i)
-            def flatten_lists(l: list):
-                return [item for sublist in l for item in sublist]
+            # Generate a list of seeds, one for each run
+            seeds = np.random.randint(0, 100000, size=featsel_runs)

+            def flatten_lists(l: list):
+                return [item for sublist in l for item in sublist]
+
             selected_columns = flatten_lists(
                 Parallel(n_jobs=n_jobs, verbose=100 * verbose)(
                     delayed(run_select_features)(i, seeds[i]) for i in range(featsel_runs)))

From bbbfa7ee3a33564e350e7d34bbbb81e29e512f8b Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Thu, 25 Jul 2024 22:30:17 +0200
Subject: [PATCH 09/20] Mod - make another seed consistent

---
 src/autofeat/featsel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index 3a62c8f..cc9f1a8 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -147,7 +147,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st

     # go through all remaining features in splits of n_feat <= 0.5*n_train
-    np.random.seed(42)
+    np.random.seed(random_seed)
     #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols))))
     other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols))))

From dcdfec0a5350c8a4600f2b7c6f50447d140b993b Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Thu, 25 Jul 2024 22:54:24 +0200
Subject: [PATCH 10/20] Add - added random seed to _noise_filtering

---
 src/autofeat/featsel.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index cc9f1a8..9067801 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -43,6 +43,7 @@ def 
_noise_filtering( target: np.ndarray, good_cols: list | None = None, problem_type: str = "regression", + random_seed: int = None ) -> list: """ Trains a prediction model with additional noise features and selects only those of the @@ -65,11 +66,12 @@ def _noise_filtering( if problem_type == "regression": model = lm.LassoLarsCV(cv=5, eps=1e-8) elif problem_type == "classification": - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced") + model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced",random_state=random_seed) else: logging.warning(f"[featsel] Unknown problem_type {problem_type} - not performing noise filtering.") model = None if model is not None: + np.random.seed(random_seed) # Set seed for noise feature addition and permutation X = _add_noise_features(X) with warnings.catch_warnings(): warnings.simplefilter("ignore") From 2a9ea6064aebcdd8715e7f0fb1ef4e042271eb09 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Thu, 25 Jul 2024 22:58:40 +0200 Subject: [PATCH 11/20] Clean - remove extra print statements --- src/autofeat/featsel.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 9067801..7ef3586 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -135,19 +135,16 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st # weight threshold: select at most 0.2*n_train initial features thr = sorted(coefs, reverse=True)[min(df.shape[1] - 1, df.shape[0] // 5)] initial_cols = list(df.columns[coefs > thr]) - print('initial_cols before noise:', initial_cols) ## Is ok, always the same # noise filter - initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type) + initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, random_seed=random_seed) good_cols_set = set(initial_cols) if verbose > 0: logging.info(f"[featsel]\t {len(initial_cols)} initial features.") - print('initial_cols after noise:', initial_cols) ## Is ok, always the same # add noise features X_w_noise = _add_noise_features(df[initial_cols].to_numpy()) - print('X_w_noise:', X_w_noise[:5, :5]) # go through all remaining features in splits of n_feat <= 0.5*n_train np.random.seed(random_seed) #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) @@ -175,9 +172,11 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st # for classification, model.coefs_ is n_classes x n_features, but we need n_features coefs = np.abs(model.coef_) if problem_type == "regression" else np.max(np.abs(model.coef_), axis=0) weights = dict(zip(current_cols, coefs[: len(current_cols)])) + # only include features that are more important than our known noise features noise_w_thr = np.max(coefs[len(current_cols) :]) good_cols_set.update([c for c in weights if abs(weights[c]) > noise_w_thr]) + if verbose > 0: print( f"[featsel]\t Split {i + 1:2}/{n_splits}: {len(good_cols_set):3} candidate features identified.", @@ -185,7 +184,6 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st ) # noise filtering on the combination of features good_cols = list(good_cols_set) - print('good_cols:', good_cols) good_cols = _noise_filtering(df[good_cols].to_numpy(), target, good_cols, problem_type) if verbose > 0: logging.info(f"\n[featsel]\t Selected {len(good_cols):3} features after noise filtering.") @@ -256,7 +254,8 @@ 
def run_select_features(i: int, seed:int):
             # only use parallelization code if you actually parallelize
             selected_columns = []
             for i in range(featsel_runs):
-                selected_columns.extend(run_select_features(i))
+                selected_columns.extend(run_select_features(i, random_seed))
+
         else:
             # Generate a list of seeds, one for each run
             seeds = np.random.randint(0, 100000, size=featsel_runs)
@@ -268,9 +267,6 @@ def flatten_lists(l: list):
             selected_columns = flatten_lists(
                 Parallel(n_jobs=n_jobs, verbose=100 * verbose)(
                     delayed(run_select_features)(i, seeds[i]) for i in range(featsel_runs)))
-            print('featsel_runs:', featsel_runs)
-            print('selected_columns:', selected_columns)
-
     if selected_columns:
         selected_columns_counter = Counter(selected_columns)
         # sort by frequency, but down weight longer formulas to break ties. Also added some randomness to fix reproducibility when equal freq and length
@@ -279,6 +275,7 @@ def flatten_lists(l: list):
             key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)) + np.random.random() * 0.000001,
             reverse=True,
         )
+
         if verbose > 0:
             logging.info(f"[featsel] {len(selected_columns)} features after {featsel_runs} feature selection runs")
     # correlation filtering
@@ -376,7 +373,6 @@ def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame):
             self.verbose,
             self.random_seed
         )
-        print('Fit self.good_cols_', self.good_cols_)
         self.n_features_in_ = X.shape[1]
         return self

From 306eacfb0d800059d7559dd9ba891fe303150783 Mon Sep 17 00:00:00 2001
From: jtimko16
Date: Mon, 5 Aug 2024 16:19:23 +0300
Subject: [PATCH 12/20] Format - run RUFF formatting on featsel

---
 src/autofeat/featsel.py | 45 +++++++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py
index 7ef3586..fd97f05 100644
--- a/src/autofeat/featsel.py
+++ b/src/autofeat/featsel.py
@@ -39,11 +39,7 @@ def _add_noise_features(X: np.ndarray):
 
 
 def _noise_filtering(
-    X: np.ndarray,
-    target: np.ndarray,
-    good_cols: list | None = None,
-    problem_type: str = "regression",
-    random_seed: int = None
+    X: np.ndarray, target: np.ndarray, good_cols: list | None = None, problem_type: str = "regression", random_seed: int = None
 ) -> list:
     """
     Trains a prediction model with additional noise features and selects only those of the
@@ -66,7 +62,7 @@ def _noise_filtering(
     if problem_type == "regression":
         model = lm.LassoLarsCV(cv=5, eps=1e-8)
     elif problem_type == "classification":
-        model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced",random_state=random_seed)
+        model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed)
     else:
         logging.warning(f"[featsel] Unknown problem_type {problem_type} - not performing noise filtering.")
         model = None
@@ -91,7 +87,9 @@ def _noise_filtering(
     return good_cols
 
 
-def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int = None) -> list:
+def _select_features_1run(
+    df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int = None
+) -> list:
     """
     One feature selection run.
 
@@ -107,7 +105,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st """ if df.shape[0] <= 1: raise ValueError(f"n_samples = {df.shape[0]}") - + # Set random seed if random_seed is not None: np.random.seed(random_seed) @@ -147,7 +145,7 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st # go through all remaining features in splits of n_feat <= 0.5*n_train np.random.seed(random_seed) - #other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) + # other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols)))) if other_cols: n_splits = int(np.ceil(len(other_cols) / max(10, 0.5 * df.shape[0] - len(initial_cols)))) @@ -158,7 +156,9 @@ def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: st if problem_type == "regression": model = lm.LassoLarsCV(cv=5, eps=1e-8) else: - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed) + model = lm.LogisticRegressionCV( + cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed + ) with warnings.catch_warnings(): warnings.simplefilter("ignore") # TODO: remove if sklearn least_angle issue is fixed @@ -219,7 +219,7 @@ def select_features( # Set random seed if random_seed is not None: np.random.seed(random_seed) - + if not (len(df) == len(target)): raise ValueError("[featsel] df and target dimension mismatch.") if keep is None: @@ -242,12 +242,14 @@ def select_features( # select good features in k runs in parallel # by doing sort of a cross-validation (i.e., randomly subsample data points) - def run_select_features(i: int, seed:int): + def run_select_features(i: int, seed: int): if verbose > 0: logging.info(f"[featsel] Feature selection run {i + 1}/{featsel_runs}") np.random.seed(seed) rand_idx = np.random.permutation(df_scaled.index)[: max(10, int(0.85 * len(df_scaled)))] - return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=seed) + return _select_features_1run( + df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=seed + ) if featsel_runs >= 1 and problem_type in ("regression", "classification"): if n_jobs == 1 or featsel_runs == 1: @@ -265,8 +267,10 @@ def flatten_lists(l: list): selected_columns = flatten_lists( Parallel(n_jobs=n_jobs, verbose=100 * verbose)( - delayed(run_select_features)(i, seeds[i]) for i in range(featsel_runs))) - + delayed(run_select_features)(i, seeds[i]) for i in range(featsel_runs) + ) + ) + if selected_columns: selected_columns_counter = Counter(selected_columns) # sort by frequency, but down weight longer formulas to break ties. 
Also added some randomness to fix reproducibility when equal freq and length @@ -317,7 +321,7 @@ def __init__( keep: list | None = None, n_jobs: int = 1, verbose: int = 0, - random_seed: int = None + random_seed: int = None, ): """ multi-step cross-validated feature selection @@ -364,14 +368,7 @@ def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame): df = pd.DataFrame(X, columns=cols) # do the feature selection self.good_cols_ = select_features( - df, - target, - self.featsel_runs, - self.keep, - self.problem_type, - self.n_jobs, - self.verbose, - self.random_seed + df, target, self.featsel_runs, self.keep, self.problem_type, self.n_jobs, self.verbose, self.random_seed ) self.n_features_in_ = X.shape[1] return self From 77336c56d4f82482aaa7f03d2f585ed6a757cd04 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 16:28:33 +0300 Subject: [PATCH 13/20] Mod - added separate cross validation before fitting models --- src/autofeat/featsel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index fd97f05..7252277 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -12,6 +12,7 @@ import sklearn.linear_model as lm from joblib import Parallel, delayed from sklearn.base import BaseEstimator +from sklearn.model_selection import KFold from sklearn.utils.validation import check_array, check_is_fitted, check_X_y from autofeat.nb_utils import nb_standard_scale @@ -59,10 +60,11 @@ def _noise_filtering( good_cols = list(range(n_feat)) assert len(good_cols) == n_feat, "fewer column names provided than features in X." # perform noise filtering on these features + kf = KFold(n_splits=5, shuffle=True, random_state=random_seed) if problem_type == "regression": - model = lm.LassoLarsCV(cv=5, eps=1e-8) + model = lm.LassoLarsCV(cv=kf, eps=1e-8) elif problem_type == "classification": - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed) + model = lm.LogisticRegressionCV(cv=kf, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed) else: logging.warning(f"[featsel] Unknown problem_type {problem_type} - not performing noise filtering.") model = None From 1e8e69f107771adf93d11fce65f4bc9cede2f434 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 16:32:03 +0300 Subject: [PATCH 14/20] Rem - removed extra random seed --- src/autofeat/featsel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 7252277..43ca06e 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -146,7 +146,6 @@ def _select_features_1run( X_w_noise = _add_noise_features(df[initial_cols].to_numpy()) # go through all remaining features in splits of n_feat <= 0.5*n_train - np.random.seed(random_seed) # other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols)))) other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols)))) if other_cols: From b2f6c7aeec0e1d7f06984bf95eb0b5ae55ef3841 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 16:46:10 +0300 Subject: [PATCH 15/20] Mod - solve the seed within 1run of select features --- src/autofeat/featsel.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 43ca06e..ba8ee1f 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -243,10 +243,14 @@ def select_features( # 
select good features in k runs in parallel # by doing sort of a cross-validation (i.e., randomly subsample data points) - def run_select_features(i: int, seed: int): + def run_select_features(i: int, random_seed: int): if verbose > 0: logging.info(f"[featsel] Feature selection run {i + 1}/{featsel_runs}") - np.random.seed(seed) + np.random.seed(random_seed) + loop_seed = np.random.randint( + 10**6 + ) # Added to random_seed to make sure that the 1run seed is different for each run, but globally reproducible + seed = random_seed + loop_seed if random_seed is not None else loop_seed rand_idx = np.random.permutation(df_scaled.index)[: max(10, int(0.85 * len(df_scaled)))] return _select_features_1run( df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=seed From ea1f74294418b6c8c81355e47773f6348c6914de Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 16:52:09 +0300 Subject: [PATCH 16/20] Mod - solved the random seed generator --- src/autofeat/featsel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index ba8ee1f..3c62e32 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -265,7 +265,11 @@ def run_select_features(i: int, random_seed: int): else: # Generate a list of seeds, one for each run - seeds = np.random.randint(0, 100000, size=featsel_runs) + def random_seed_generator(low=0, high=2**32 - 1): + while True: + seed = np.random.randint(low, high) + yield seed + seeds = random_seed_generator() def flatten_lists(l: list): return [item for sublist in l for item in sublist] From 73b83816f063c2bd21bf91d627e3b9c707ab8e2d Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 17:00:27 +0300 Subject: [PATCH 17/20] Typing - fixed typing hint of random_seed --- src/autofeat/featsel.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 3c62e32..bd27e10 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -40,7 +40,11 @@ def _add_noise_features(X: np.ndarray): def _noise_filtering( - X: np.ndarray, target: np.ndarray, good_cols: list | None = None, problem_type: str = "regression", random_seed: int = None + X: np.ndarray, + target: np.ndarray, + good_cols: list | None = None, + problem_type: str = "regression", + random_seed: int | None = None, ) -> list: """ Trains a prediction model with additional noise features and selects only those of the @@ -90,7 +94,7 @@ def _noise_filtering( def _select_features_1run( - df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int = None + df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int | None = None ) -> list: """ One feature selection run. @@ -199,7 +203,7 @@ def select_features( problem_type: str = "regression", n_jobs: int = 1, verbose: int = 0, - random_seed: int = None, + random_seed: int | None = None, ) -> list: """ Selects predictive features given the data and targets. 
@@ -269,6 +273,7 @@ def random_seed_generator(low=0, high=2**32 - 1): while True: seed = np.random.randint(low, high) yield seed + seeds = random_seed_generator() def flatten_lists(l: list): @@ -330,7 +335,7 @@ def __init__( keep: list | None = None, n_jobs: int = 1, verbose: int = 0, - random_seed: int = None, + random_seed: int | None = None, ): """ multi-step cross-validated feature selection From 0a026901476056cf90691f800b8535e92f509780 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Mon, 5 Aug 2024 17:04:37 +0300 Subject: [PATCH 18/20] Mod - removed extra randomness in selecting columns --- src/autofeat/featsel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index bd27e10..3cd2315 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -141,7 +141,7 @@ def _select_features_1run( initial_cols = list(df.columns[coefs > thr]) # noise filter - initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, random_seed=random_seed) + initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type, random_seed=random_seed) good_cols_set = set(initial_cols) if verbose > 0: logging.info(f"[featsel]\t {len(initial_cols)} initial features.") @@ -290,7 +290,7 @@ def flatten_lists(l: list): # sort by frequency, but down weight longer formulas to break ties. Also added some randomness to fix reproducibility when equal freq and length selected_columns = sorted( selected_columns_counter, - key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)) + np.random.random() * 0.000001, + key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)), reverse=True, ) From eff942847f584f5ac23b5b95bbbd7bb0e43314a1 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Tue, 6 Aug 2024 18:47:05 +0300 Subject: [PATCH 19/20] Mod - using KFold with all CV models; move random_seed_generator to utils.py --- src/autofeat/featsel.py | 18 ++++++++---------- src/autofeat/utils.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 10 deletions(-) create mode 100644 src/autofeat/utils.py diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index 3cd2315..edb412a 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -16,6 +16,7 @@ from sklearn.utils.validation import check_array, check_is_fitted, check_X_y from autofeat.nb_utils import nb_standard_scale +from autofeat.utils import random_seed_generator logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) @@ -117,10 +118,11 @@ def _select_features_1run( np.random.seed(random_seed) # initial selection of too few but (hopefully) relevant features + kf = KFold(n_splits=5, shuffle=True, random_state=random_seed) if problem_type == "regression": - model = lm.LassoLarsCV(cv=5, eps=1e-8) + model = lm.LassoLarsCV(cv=kf, eps=1e-8) elif problem_type == "classification": - model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced") + model = lm.LogisticRegressionCV(cv=kf, penalty="l1", solver="saga", class_weight="balanced") else: logging.warning(f"[featsel] Unknown problem_type {problem_type} - not performing feature selection!") return [] @@ -158,11 +160,12 @@ def _select_features_1run( for i in range(n_splits): current_cols = other_cols[i * split_size : min(len(other_cols), (i + 1) * split_size)] X = np.hstack([df[current_cols].to_numpy(), X_w_noise]) + kf = KFold(n_splits=5, shuffle=True, random_state=random_seed) if problem_type == 
"regression": - model = lm.LassoLarsCV(cv=5, eps=1e-8) + model = lm.LassoLarsCV(cv=kf, eps=1e-8) else: model = lm.LogisticRegressionCV( - cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed + cv=kf, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed ) with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -269,12 +272,7 @@ def run_select_features(i: int, random_seed: int): else: # Generate a list of seeds, one for each run - def random_seed_generator(low=0, high=2**32 - 1): - while True: - seed = np.random.randint(low, high) - yield seed - - seeds = random_seed_generator() + seeds = random_seed_generator(num_seeds=featsel_runs) def flatten_lists(l: list): return [item for sublist in l for item in sublist] diff --git a/src/autofeat/utils.py b/src/autofeat/utils.py new file mode 100644 index 0000000..b129660 --- /dev/null +++ b/src/autofeat/utils.py @@ -0,0 +1,17 @@ +import numpy as np + + +def random_seed_generator(num_seeds: int, low: int = 0, high: int = 2**32 - 1): + """ + Generate a specified number of random integer seeds. + + Parameters: + num_seeds: Number of random seeds to generate. + low: Lower bound for random integers (default is 0). + high: Upper bound for random integers (default is 2**32 - 1). + + Returns: + list: List of random seeds. + """ + # Generate and return a list of random seeds + return [np.random.randint(low, high) for _ in range(num_seeds)] From de21a01151817606af31aa3b82fc87185c7eb9e6 Mon Sep 17 00:00:00 2001 From: jtimko16 Date: Thu, 8 Aug 2024 17:24:50 +0300 Subject: [PATCH 20/20] Mod - replaced custom function by np.random.default_rng(); fixed the comment --- src/autofeat/featsel.py | 6 +++--- src/autofeat/utils.py | 17 ----------------- 2 files changed, 3 insertions(+), 20 deletions(-) delete mode 100644 src/autofeat/utils.py diff --git a/src/autofeat/featsel.py b/src/autofeat/featsel.py index edb412a..50b4ccd 100644 --- a/src/autofeat/featsel.py +++ b/src/autofeat/featsel.py @@ -16,7 +16,6 @@ from sklearn.utils.validation import check_array, check_is_fitted, check_X_y from autofeat.nb_utils import nb_standard_scale -from autofeat.utils import random_seed_generator logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) @@ -272,7 +271,8 @@ def run_select_features(i: int, random_seed: int): else: # Generate a list of seeds, one for each run - seeds = random_seed_generator(num_seeds=featsel_runs) + generator = np.random.default_rng(seed=random_seed) + seeds = generator.integers(low=0, high=10**6, size=featsel_runs) def flatten_lists(l: list): return [item for sublist in l for item in sublist] @@ -285,7 +285,7 @@ def flatten_lists(l: list): if selected_columns: selected_columns_counter = Counter(selected_columns) - # sort by frequency, but down weight longer formulas to break ties. Also added some randomness to fix reproducibility when equal freq and length + # sort by frequency, but down weight longer formulas to break ties selected_columns = sorted( selected_columns_counter, key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)), diff --git a/src/autofeat/utils.py b/src/autofeat/utils.py deleted file mode 100644 index b129660..0000000 --- a/src/autofeat/utils.py +++ /dev/null @@ -1,17 +0,0 @@ -import numpy as np - - -def random_seed_generator(num_seeds: int, low: int = 0, high: int = 2**32 - 1): - """ - Generate a specified number of random integer seeds. - - Parameters: - num_seeds: Number of random seeds to generate. 
- low: Lower bound for random integers (default is 0). - high: Upper bound for random integers (default is 2**32 - 1). - - Returns: - list: List of random seeds. - """ - # Generate and return a list of random seeds - return [np.random.randint(low, high) for _ in range(num_seeds)]
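

Taken together, the series threads a random_seed argument through select_features() and FeatureSelector, seeds the data subsampling and the per-run worker seeds (drawn from np.random.default_rng in PATCH 20), and pins the cross-validation splits with a seeded KFold (PATCHES 13 and 19). A minimal end-to-end smoke test for the behavior the series is after might look like the sketch below; the synthetic data, the column names, and the exact constructor keywords are illustrative assumptions, not taken from the patches themselves:

import numpy as np
import pandas as pd

from autofeat.featsel import FeatureSelector

# Toy regression problem: the target depends on two of the ten columns.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 10)), columns=[f"x{i}" for i in range(10)])
y = (3 * X["x0"] - 2 * X["x3"] + rng.normal(scale=0.1, size=200)).to_numpy()

# With random_seed fixed, two independent fits on the same data should
# select exactly the same feature columns.
fs_a = FeatureSelector(problem_type="regression", random_seed=42).fit(X, y)
fs_b = FeatureSelector(problem_type="regression", random_seed=42).fit(X, y)
assert fs_a.good_cols_ == fs_b.good_cols_

The same check should also pass with n_jobs > 1: after PATCH 20 the per-run seeds come from np.random.default_rng(seed=random_seed) rather than from unseeded global state, so the parallel path no longer depends on process-level RNG state.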