From 9e5a074e8283695546728d44e197c22e5e2ef850 Mon Sep 17 00:00:00 2001 From: Daniel Grindrod Date: Thu, 24 Oct 2024 16:12:35 +0100 Subject: [PATCH 1/7] fix: Fixed bug where every underlying LGBMRegressor or LGBMClassifier had n_estimators = 1 --- flaml/automl/model.py | 7 ------- test/automl/test_classification.py | 2 +- test/automl/test_regression.py | 5 ++++- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/flaml/automl/model.py b/flaml/automl/model.py index 12c3dca361..848563923b 100644 --- a/flaml/automl/model.py +++ b/flaml/automl/model.py @@ -1567,13 +1567,6 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): # for xgboost>=1.6.0, pop callbacks to enable pickle callbacks = self.params.pop("callbacks") self._model.set_params(callbacks=callbacks[:-1]) - best_iteration = ( - getattr(self._model.get_booster(), "best_iteration", None) - if isinstance(self, XGBoostSklearnEstimator) - else self._model.best_iteration_ - ) - if best_iteration is not None: - self._model.set_params(n_estimators=best_iteration + 1) else: self._fit(X_train, y_train, **kwargs) train_time = time.time() - start_time diff --git a/test/automl/test_classification.py b/test/automl/test_classification.py index 942b5cda37..4a383829ae 100644 --- a/test/automl/test_classification.py +++ b/test/automl/test_classification.py @@ -492,7 +492,7 @@ def test_reproducibility_of_classification_models(estimator: str): "extra_tree", "histgb", "kneighbor", - # "lgbm", + "lgbm", # "lrl1", "lrl2", "rf", diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py index 0bef79990f..80756ca6fb 100644 --- a/test/automl/test_regression.py +++ b/test/automl/test_regression.py @@ -345,7 +345,7 @@ def test_reproducibility_of_catboost_regression_model(): "extra_tree", "histgb", "kneighbor", - # "lgbm", + "lgbm", "rf", "xgboost", "xgb_limitdepth", @@ -374,12 +374,15 @@ def test_reproducibility_of_underlying_regression_models(estimator: str): "metric": "r2", 
"keep_search_state": True, "skip_transform": True, + "retrain_full": False, } X, y = fetch_california_housing(return_X_y=True, as_frame=True) automl.fit(X_train=X, y_train=y, **automl_settings) best_model = automl.model assert best_model is not None val_loss_flaml = automl.best_result["val_loss"] + print("best_model", best_model.get_params()) + print("underlying model", best_model.model.get_params()) reproduced_val_loss_underlying_model = np.mean( evaluate_cv_folds_with_underlying_model( automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "regression" From 8f4b13af3cacb32dd17ca0da45c831986201de94 Mon Sep 17 00:00:00 2001 From: Daniel Grindrod Date: Thu, 24 Oct 2024 17:15:59 +0100 Subject: [PATCH 2/7] test: Added test showing case where FLAMLised LGBM model result isn't reproducible --- test/automl/test_regression.py | 48 ++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py index 80756ca6fb..6ba33efe7d 100644 --- a/test/automl/test_regression.py +++ b/test/automl/test_regression.py @@ -338,6 +338,52 @@ def test_reproducibility_of_catboost_regression_model(): assert pytest.approx(val_loss_flaml) == reproduced_val_loss +def test_reproducibility_of_lgbm_regression_model(): + """FLAML finds the best model for a given dataset, which it then provides to users. + + However, there are reported issues around LGBMs - see here: + https://github.com/microsoft/FLAML/issues/1368 + In this test we take the best LGB regression model which FLAML provided us, and then retrain and test it on the + same folds, to verify that the result is reproducible. 
+ """ + automl = AutoML() + automl_settings = { + "time_budget": 3, + "task": "regression", + "n_jobs": 1, + "estimator_list": ["lgbm"], + "eval_method": "cv", + "n_splits": 9, + "metric": "r2", + "keep_search_state": True, + "skip_transform": True, + "retrain_full": True, + } + X, y = fetch_california_housing(return_X_y=True, as_frame=True) + automl.fit(X_train=X, y_train=y, **automl_settings) + best_model = automl.model + assert best_model is not None + config = best_model.get_params() + val_loss_flaml = automl.best_result["val_loss"] + + # Take the best model, and see if we can reproduce the best result + reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV( + config=config, + estimator=best_model, + X_train_all=automl._state.X_train_all, + y_train_all=automl._state.y_train_all, + budget=None, + kf=automl._state.kf, + eval_metric="r2", + best_val_loss=None, + cv_score_agg_func=None, + log_training_metric=False, + fit_kwargs=None, + free_mem_ratio=0, + ) + assert pytest.approx(val_loss_flaml) == reproduced_val_loss + + @pytest.mark.parametrize( "estimator", [ @@ -381,8 +427,6 @@ def test_reproducibility_of_underlying_regression_models(estimator: str): best_model = automl.model assert best_model is not None val_loss_flaml = automl.best_result["val_loss"] - print("best_model", best_model.get_params()) - print("underlying model", best_model.model.get_params()) reproduced_val_loss_underlying_model = np.mean( evaluate_cv_folds_with_underlying_model( automl._state.X_train_all, automl._state.y_train_all, automl._state.kf, best_model.model, "regression" From f90d3ab49c073ba545f76e04f0835943e35dd366 Mon Sep 17 00:00:00 2001 From: Daniel Grindrod Date: Thu, 24 Oct 2024 17:28:11 +0100 Subject: [PATCH 3/7] fix: Fixing issue where callbacks cause LGBM results to not be reproducible --- flaml/automl/model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/flaml/automl/model.py b/flaml/automl/model.py index 
848563923b..3057538055 100644 --- a/flaml/automl/model.py +++ b/flaml/automl/model.py @@ -1581,8 +1581,6 @@ def _callback(self, start_time, deadline, free_mem_ratio, env) -> None: now = time.time() if env.iteration == 0: self._time_per_iter = now - start_time - if now + self._time_per_iter > deadline: - raise EarlyStopException(env.iteration, env.evaluation_result_list) if psutil is not None: mem = psutil.virtual_memory() if mem.available / mem.total < free_mem_ratio: From d43b8c1f19c37dc20db650f75ef605144e1ffe6a Mon Sep 17 00:00:00 2001 From: Daniel Grindrod Date: Tue, 29 Oct 2024 11:02:43 +0000 Subject: [PATCH 4/7] Update test/automl/test_regression.py Co-authored-by: Li Jiang --- test/automl/test_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py index f2a2200504..eaefec4914 100644 --- a/test/automl/test_regression.py +++ b/test/automl/test_regression.py @@ -344,7 +344,7 @@ def test_reproducibility_of_lgbm_regression_model(): However, there are reported issues around LGBMs - see here: https://github.com/microsoft/FLAML/issues/1368 - In this test we take the best LGB regression model which FLAML provided us, and then retrain and test it on the + In this test we take the best LGBM regression model which FLAML provided us, and then retrain and test it on the same folds, to verify that the result is reproducible. 
""" automl = AutoML() From 654194de5835741e74352a9cff2de3a51afc2efd Mon Sep 17 00:00:00 2001 From: Daniel Grindrod Date: Tue, 29 Oct 2024 11:39:45 +0000 Subject: [PATCH 5/7] fix: Adding back the LGBM EarlyStopping --- flaml/automl/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flaml/automl/model.py b/flaml/automl/model.py index e2dc67d3ef..630fe024be 100644 --- a/flaml/automl/model.py +++ b/flaml/automl/model.py @@ -1581,6 +1581,8 @@ def _callback(self, start_time, deadline, free_mem_ratio, env) -> None: now = time.time() if env.iteration == 0: self._time_per_iter = now - start_time + if now + self._time_per_iter > deadline: + raise EarlyStopException(env.iteration, env.evaluation_result_list) if psutil is not None: mem = psutil.virtual_memory() if mem.available / mem.total < free_mem_ratio: From 851747ce495150b5f17688b732a17e784f280e7c Mon Sep 17 00:00:00 2001 From: Daniel Grindrod Date: Thu, 31 Oct 2024 10:03:09 +0000 Subject: [PATCH 6/7] refactor: Fix tweaked to ensure other models aren't likely to be affected --- flaml/automl/model.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/flaml/automl/model.py b/flaml/automl/model.py index 630fe024be..16a09f415b 100644 --- a/flaml/automl/model.py +++ b/flaml/automl/model.py @@ -1561,12 +1561,18 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs): callbacks = None if callbacks is None: self._fit(X_train, y_train, **kwargs) - else: - self._fit(X_train, y_train, callbacks=callbacks, **kwargs) - if callbacks is None: # for xgboost>=1.6.0, pop callbacks to enable pickle callbacks = self.params.pop("callbacks") self._model.set_params(callbacks=callbacks[:-1]) + else: + self._fit(X_train, y_train, callbacks=callbacks, **kwargs) + best_iteration = ( + getattr(self._model.get_booster(), "best_iteration", None) + if isinstance(self, XGBoostSklearnEstimator) + else self._model.best_iteration_ + ) + if best_iteration is not None and best_iteration > 0: + 
self._model.set_params(n_estimators=best_iteration + 1) else: self._fit(X_train, y_train, **kwargs) train_time = time.time() - start_time From d8dfdc7eb7c5077c5e7a2dfb02d9d1284abc2a34 Mon Sep 17 00:00:00 2001 From: Daniel Grindrod Date: Thu, 31 Oct 2024 14:38:29 +0000 Subject: [PATCH 7/7] test: Fixed test to allow reproduced results to be better than the FLAML results, when LGBM earlystopping is involved --- test/automl/test_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py index eaefec4914..892ad1eceb 100644 --- a/test/automl/test_regression.py +++ b/test/automl/test_regression.py @@ -382,7 +382,7 @@ def test_reproducibility_of_lgbm_regression_model(): fit_kwargs=None, free_mem_ratio=0, ) - assert pytest.approx(val_loss_flaml) == reproduced_val_loss + assert pytest.approx(val_loss_flaml) == reproduced_val_loss or val_loss_flaml > reproduced_val_loss @pytest.mark.parametrize(