From 39edc08907ffa9a8dc9017cb6cb6c86da9e08707 Mon Sep 17 00:00:00 2001
From: Paul Koch <code@koch.ninja>
Date: Fri, 3 Jan 2025 13:22:43 -0800
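Subject: [PATCH] eliminate min_cat_hessian_percent

Remove the minCategoryHessianPercent / min_cat_hessian_percent parameter
from the exported GenerateTermUpdate signature (libebm.h), the internal
boosting call chain, the Python ctypes binding and develop options, the
R wrapper, and the native tests. The value was only marked UNUSED in
PartitionOneDimensionalBoosting, so dropping it is an interface cleanup.

Note that this changes the positional argument layout of
GenerateTermUpdate, so the native library and all of its callers must be
updated together.

The R wrapper's hard-coded maxCategoricalThreshold argument is also
raised from 32 to 9223372036854775807, matching the Python
max_cat_threshold default in develop.py.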

---
 R/src/interpret_R.cpp                             |  3 +--
 python/interpret-core/interpret/develop.py        |  1 -
 .../interpret/glassbox/_ebm/_boost.py             |  6 ------
 python/interpret-core/interpret/utils/_native.py  |  5 -----
 shared/libebm/GenerateTermUpdate.cpp              | 15 ---------------
 shared/libebm/PartitionOneDimensionalBoosting.cpp | 10 +---------
 shared/libebm/inc/libebm.h                        |  1 -
 shared/libebm/tests/boosting_unusual_inputs.cpp   | 12 ------------
 shared/libebm/tests/libebm_test.cpp               |  2 --
 shared/libebm/tests/libebm_test.hpp               |  2 --
 10 files changed, 2 insertions(+), 55 deletions(-)

diff --git a/R/src/interpret_R.cpp b/R/src/interpret_R.cpp
index a4bbf9e49..c0fd4c6f0 100644
--- a/R/src/interpret_R.cpp
+++ b/R/src/interpret_R.cpp
@@ -836,9 +836,8 @@ SEXP GenerateTermUpdate_R(
       0,
       0,
       0,
-      0.0,
       10.0,
-      32,
+      9223372036854775807,
       1.0,
       aLeavesMax,
       nullptr,
diff --git a/python/interpret-core/interpret/develop.py b/python/interpret-core/interpret/develop.py
index 48244447c..5c8b94803 100644
--- a/python/interpret-core/interpret/develop.py
+++ b/python/interpret-core/interpret/develop.py
@@ -17,7 +17,6 @@
     "cat_l2": 0.0,
     "min_samples_leaf_nominal": None,
     "min_cat_samples": 10,
-    "min_cat_hessian_percent": 0.0,
     "cat_smooth": math.inf,  # math.inf means use only the gradient for sorting
     "max_cat_threshold": 9223372036854775807,
     "cat_include": 1.0,
diff --git a/python/interpret-core/interpret/glassbox/_ebm/_boost.py b/python/interpret-core/interpret/glassbox/_ebm/_boost.py
index 018f493cc..c658d827e 100644
--- a/python/interpret-core/interpret/glassbox/_ebm/_boost.py
+++ b/python/interpret-core/interpret/glassbox/_ebm/_boost.py
@@ -79,9 +79,6 @@ def boost(
                     reg_lambda=reg_lambda,
                     max_delta_step=0.0,
                     min_cat_samples=develop.get_option("min_cat_samples"),
-                    min_cat_hessian_percent=develop.get_option(
-                        "min_cat_hessian_percent"
-                    ),
                     cat_smooth=develop.get_option("cat_smooth"),
                     max_cat_threshold=develop.get_option("max_cat_threshold"),
                     cat_include=develop.get_option("cat_include"),
@@ -190,9 +187,6 @@ def boost(
                         reg_lambda=reg_lambda_local,
                         max_delta_step=max_delta_step,
                         min_cat_samples=develop.get_option("min_cat_samples"),
-                        min_cat_hessian_percent=develop.get_option(
-                            "min_cat_hessian_percent"
-                        ),
                         cat_smooth=develop.get_option("cat_smooth"),
                         max_cat_threshold=develop.get_option("max_cat_threshold"),
                         cat_include=develop.get_option("cat_include"),
diff --git a/python/interpret-core/interpret/utils/_native.py b/python/interpret-core/interpret/utils/_native.py
index d618b6577..17eb9a693 100644
--- a/python/interpret-core/interpret/utils/_native.py
+++ b/python/interpret-core/interpret/utils/_native.py
@@ -1511,8 +1511,6 @@ def _initialize(self, is_debug):
             ct.c_double,
             # int64_t minCategorySamples
             ct.c_int64,
-            # double minCategoryHessianPercent
-            ct.c_double,
             # double categoricalSmoothing
             ct.c_double,
             # int64_t maxCategoricalThreshold
@@ -1835,7 +1833,6 @@ def generate_term_update(
         reg_lambda,
         max_delta_step,
         min_cat_samples,
-        min_cat_hessian_percent,
         cat_smooth,
         max_cat_threshold,
         cat_include,
@@ -1855,7 +1852,6 @@ def generate_term_update(
             reg_lambda: L2 regularization.
             max_delta_step: Used to limit the max output of tree leaves. <=0.0 means no constraint.
             min_cat_samples: Min samples to consider category independently
-            min_cat_hessian_percent: Min percentage of the hessians to consider category independently
             cat_smooth: Parameter used to determine which categories are included each boosting round and ordering.
             max_cat_threshold: max number of categories to include each boosting round
             cat_include: percentage of categories to include in each boosting round
@@ -1904,7 +1900,6 @@ def generate_term_update(
             reg_lambda,
             max_delta_step,
             min_cat_samples,
-            min_cat_hessian_percent,
             cat_smooth,
             max_cat_threshold,
             cat_include,
diff --git a/shared/libebm/GenerateTermUpdate.cpp b/shared/libebm/GenerateTermUpdate.cpp
index 60e411fd9..c04bdffad 100644
--- a/shared/libebm/GenerateTermUpdate.cpp
+++ b/shared/libebm/GenerateTermUpdate.cpp
@@ -89,7 +89,6 @@ extern ErrorEbm PartitionOneDimensionalBoosting(RandomDeterministic* const pRng,
       const FloatCalc regLambda,
       const FloatCalc deltaStepMax,
       const size_t cCategorySamplesMin,
-      const FloatCalc categoryHessianPercentMin,
       const FloatCalc categoricalSmoothing,
       const size_t categoricalThresholdMax,
       const FloatCalc categoricalInclusionPercent,
@@ -217,7 +216,6 @@ static ErrorEbm BoostSingleDimensional(RandomDeterministic* const pRng,
       const FloatCalc regLambda,
       const FloatCalc deltaStepMax,
       const size_t cCategorySamplesMin,
-      const FloatCalc categoryHessianPercentMin,
       const FloatCalc categoricalSmoothing,
       const size_t categoricalThresholdMax,
       const FloatCalc categoricalInclusionPercent,
@@ -250,7 +248,6 @@ static ErrorEbm BoostSingleDimensional(RandomDeterministic* const pRng,
          regLambda,
          deltaStepMax,
          cCategorySamplesMin,
-         categoryHessianPercentMin,
          categoricalSmoothing,
          categoricalThresholdMax,
          categoricalInclusionPercent,
@@ -662,7 +659,6 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng,
       double regLambda,
       double maxDeltaStep,
       IntEbm minCategorySamples,
-      double minCategoryHessianPercent,
       double categoricalSmoothing,
       IntEbm maxCategoricalThreshold,
       double categoricalInclusionPercent,
@@ -686,7 +682,6 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng,
          "regLambda=%le, "
          "maxDeltaStep=%le, "
          "minCategorySamples=%" IntEbmPrintf ", "
-         "minCategoryHessianPercent=%le, "
          "categoricalSmoothing=%le, "
          "maxCategoricalThreshold=%" IntEbmPrintf ", "
          "categoricalInclusionPercent=%le, "
@@ -704,7 +699,6 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng,
          regLambda,
          maxDeltaStep,
          minCategorySamples,
-         minCategoryHessianPercent,
          categoricalSmoothing,
          maxCategoricalThreshold,
          categoricalInclusionPercent,
@@ -838,14 +832,6 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng,
       LOG_0(Trace_Warning, "WARNING GenerateTermUpdate minSamplesLeaf can't be less than 0.  Adjusting to 0.");
    }
 
-   FloatCalc categoryHessianPercentMin = static_cast<FloatCalc>(minCategoryHessianPercent);
-   if(/* NaN */ !(0.0 <= categoryHessianPercentMin)) {
-      categoryHessianPercentMin = 0.0;
-      LOG_0(Trace_Warning,
-            "WARNING GenerateTermUpdate minCategoryHessianPercent must be a positive number. Adjusting to minimum "
-            "float");
-   }
-
    FloatCalc categoricalSmoothingCalc = static_cast<FloatCalc>(categoricalSmoothing);
    if(categoricalSmoothingCalc < std::numeric_limits<FloatCalc>::min()) {
       // allow isnan(categoricalSmoothingCalc) through unscathed
@@ -1281,7 +1267,6 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng,
                      regLambdaCalc,
                      deltaStepMax,
                      cCategorySamplesMin,
-                     categoryHessianPercentMin,
                      categoricalSmoothingCalc,
                      categoricalThresholdMax,
                      categoricalInclusionPercentCalc,
diff --git a/shared/libebm/PartitionOneDimensionalBoosting.cpp b/shared/libebm/PartitionOneDimensionalBoosting.cpp
index 8b10ed9a4..cf27433dd 100644
--- a/shared/libebm/PartitionOneDimensionalBoosting.cpp
+++ b/shared/libebm/PartitionOneDimensionalBoosting.cpp
@@ -963,7 +963,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
          const FloatCalc regLambda,
          const FloatCalc deltaStepMax,
          const size_t cCategorySamplesMin,
-         const FloatCalc categoryHessianPercentMin,
          const FloatCalc categoricalSmoothing,
          const size_t categoricalThresholdMax,
          const FloatCalc categoricalInclusionPercent,
@@ -979,9 +978,8 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
 
       ErrorEbm error;
 
-      // TODO: use all of these!
+      // TODO: mirror the bMissing option for bUnseen
       UNUSED(bUnseen);
-      UNUSED(categoryHessianPercentMin);
 
       BoosterCore* const pBoosterCore = pBoosterShell->GetBoosterCore();
       const size_t cScores = GET_COUNT_SCORES(cCompilerScores, pBoosterCore->GetCountScores());
@@ -1420,7 +1418,6 @@ extern ErrorEbm PartitionOneDimensionalBoosting(RandomDeterministic* const pRng,
       const FloatCalc regLambda,
       const FloatCalc deltaStepMax,
       const size_t cCategorySamplesMin,
-      const FloatCalc categoryHessianPercentMin,
       const FloatCalc categoricalSmoothing,
       const size_t categoricalThresholdMax,
       const FloatCalc categoricalInclusionPercent,
@@ -1453,7 +1450,6 @@ extern ErrorEbm PartitionOneDimensionalBoosting(RandomDeterministic* const pRng,
                regLambda,
                deltaStepMax,
                cCategorySamplesMin,
-               categoryHessianPercentMin,
                categoricalSmoothing,
                categoricalThresholdMax,
                categoricalInclusionPercent,
@@ -1478,7 +1474,6 @@ extern ErrorEbm PartitionOneDimensionalBoosting(RandomDeterministic* const pRng,
                regLambda,
                deltaStepMax,
                cCategorySamplesMin,
-               categoryHessianPercentMin,
                categoricalSmoothing,
                categoricalThresholdMax,
                categoricalInclusionPercent,
@@ -1503,7 +1498,6 @@ extern ErrorEbm PartitionOneDimensionalBoosting(RandomDeterministic* const pRng,
                regLambda,
                deltaStepMax,
                cCategorySamplesMin,
-               categoryHessianPercentMin,
                categoricalSmoothing,
                categoricalThresholdMax,
                categoricalInclusionPercent,
@@ -1529,7 +1523,6 @@ extern ErrorEbm PartitionOneDimensionalBoosting(RandomDeterministic* const pRng,
                regLambda,
                deltaStepMax,
                cCategorySamplesMin,
-               categoryHessianPercentMin,
                categoricalSmoothing,
                categoricalThresholdMax,
                categoricalInclusionPercent,
@@ -1554,7 +1547,6 @@ extern ErrorEbm PartitionOneDimensionalBoosting(RandomDeterministic* const pRng,
                regLambda,
                deltaStepMax,
                cCategorySamplesMin,
-               categoryHessianPercentMin,
                categoricalSmoothing,
                categoricalThresholdMax,
                categoricalInclusionPercent,
diff --git a/shared/libebm/inc/libebm.h b/shared/libebm/inc/libebm.h
index 35bee9dd2..1e425a2d9 100644
--- a/shared/libebm/inc/libebm.h
+++ b/shared/libebm/inc/libebm.h
@@ -466,7 +466,6 @@ EBM_API_INCLUDE ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng,
       double regLambda,
       double maxDeltaStep,
       IntEbm minCategorySamples,
-      double minCategoryHessianPercent,
       double categoricalSmoothing,
       IntEbm maxCategoricalThreshold,
       double categoricalInclusionPercent,
diff --git a/shared/libebm/tests/boosting_unusual_inputs.cpp b/shared/libebm/tests/boosting_unusual_inputs.cpp
index 0a7b262bc..bf5b461fa 100644
--- a/shared/libebm/tests/boosting_unusual_inputs.cpp
+++ b/shared/libebm/tests/boosting_unusual_inputs.cpp
@@ -276,7 +276,6 @@ TEST_CASE("leave one potential cut uncut, boosting, regression") {
                                        0,
                                        0,
                                        k_minCategorySamplesDefault,
-                                       k_minCategoryHessianPercentDefault,
                                        k_categoricalSmoothingDefault,
                                        k_maxCategoricalThresholdDefault,
                                        k_categoricalInclusionPercentDefault,
@@ -556,7 +555,6 @@ TEST_CASE("one leavesMax, boosting, regression") {
                                        k_regLambdaDefault,
                                        k_maxDeltaStepDefault,
                                        k_minCategorySamplesDefault,
-                                       k_minCategoryHessianPercentDefault,
                                        k_categoricalSmoothingDefault,
                                        k_maxCategoricalThresholdDefault,
                                        k_categoricalInclusionPercentDefault,
@@ -596,7 +594,6 @@ TEST_CASE("mono-classification") {
          k_regLambdaDefault,
          k_maxDeltaStepDefault,
          k_minCategorySamplesDefault,
-         k_minCategoryHessianPercentDefault,
          k_categoricalSmoothingDefault,
          k_maxCategoricalThresholdDefault,
          k_categoricalInclusionPercentDefault,
@@ -1255,7 +1252,6 @@ TEST_CASE("Random splitting with 3 features, boosting, multiclass") {
                                              k_regLambdaDefault,
                                              k_maxDeltaStepDefault,
                                              k_minCategorySamplesDefault,
-                                             k_minCategoryHessianPercentDefault,
                                              k_categoricalSmoothingDefault,
                                              k_maxCategoricalThresholdDefault,
                                              k_categoricalInclusionPercentDefault,
@@ -1296,7 +1292,6 @@ TEST_CASE("Random splitting with 3 features, boosting, multiclass, sums") {
                                              k_regLambdaDefault,
                                              k_maxDeltaStepDefault,
                                              k_minCategorySamplesDefault,
-                                             k_minCategoryHessianPercentDefault,
                                              k_categoricalSmoothingDefault,
                                              k_maxCategoricalThresholdDefault,
                                              k_categoricalInclusionPercentDefault,
@@ -1357,7 +1352,6 @@ TEST_CASE("Random splitting, tripple with one dimension missing, multiclass") {
                                       k_regLambdaDefault,
                                       k_maxDeltaStepDefault,
                                       k_minCategorySamplesDefault,
-                                      k_minCategoryHessianPercentDefault,
                                       k_categoricalSmoothingDefault,
                                       k_maxCategoricalThresholdDefault,
                                       k_categoricalInclusionPercentDefault,
@@ -1425,7 +1419,6 @@ TEST_CASE("Random splitting, pure tripples, multiclass") {
                                       k_regLambdaDefault,
                                       k_maxDeltaStepDefault,
                                       k_minCategorySamplesDefault,
-                                      k_minCategoryHessianPercentDefault,
                                       k_categoricalSmoothingDefault,
                                       k_maxCategoricalThresholdDefault,
                                       k_categoricalInclusionPercentDefault,
@@ -1494,7 +1487,6 @@ TEST_CASE("Random splitting, pure tripples, regression") {
                                       k_regLambdaDefault,
                                       k_maxDeltaStepDefault,
                                       k_minCategorySamplesDefault,
-                                      k_minCategoryHessianPercentDefault,
                                       k_categoricalSmoothingDefault,
                                       k_maxCategoricalThresholdDefault,
                                       k_categoricalInclusionPercentDefault,
@@ -1560,7 +1552,6 @@ TEST_CASE("Random splitting, pure tripples, only 1 leaf, multiclass") {
                                       k_regLambdaDefault,
                                       k_maxDeltaStepDefault,
                                       k_minCategorySamplesDefault,
-                                      k_minCategoryHessianPercentDefault,
                                       k_categoricalSmoothingDefault,
                                       k_maxCategoricalThresholdDefault,
                                       k_categoricalInclusionPercentDefault,
@@ -1621,7 +1612,6 @@ TEST_CASE("Random splitting, no splits, binary, sums") {
                                       k_regLambdaDefault,
                                       k_maxDeltaStepDefault,
                                       k_minCategorySamplesDefault,
-                                      k_minCategoryHessianPercentDefault,
                                       k_categoricalSmoothingDefault,
                                       k_maxCategoricalThresholdDefault,
                                       k_categoricalInclusionPercentDefault,
@@ -2336,7 +2326,6 @@ static double RandomizedTesting(const AccelerationFlags acceleration) {
                const double regLambda = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
                const double maxDeltaStep = 0 == TestRand(rng, 5) ? 1.0 : 0.0;
                const IntEbm minCategorySamples = TestRand(rng, 100);
-               const double minCategoryHessianPercent = 0.0; // TODO: make random
                const double categoricalSmoothing = 10.0;
                const IntEbm maxCategoricalThreshold = 1 + TestRand(rng, cRealBins + 1);
                const double categoricalInclusionPercent = 0 == TestRand(rng, 2) ? 0.75 : 1.0;
@@ -2357,7 +2346,6 @@ static double RandomizedTesting(const AccelerationFlags acceleration) {
                                                      regLambda,
                                                      maxDeltaStep,
                                                      minCategorySamples,
-                                                     minCategoryHessianPercent,
                                                      categoricalSmoothing,
                                                      maxCategoricalThreshold,
                                                      categoricalInclusionPercent,
diff --git a/shared/libebm/tests/libebm_test.cpp b/shared/libebm/tests/libebm_test.cpp
index 80cba0bb2..c7d18e495 100644
--- a/shared/libebm/tests/libebm_test.cpp
+++ b/shared/libebm/tests/libebm_test.cpp
@@ -547,7 +547,6 @@ BoostRet TestBoost::Boost(const IntEbm indexTerm,
       const double regLambda,
       const double maxDeltaStep,
       const IntEbm minCategorySamples,
-      const double minCategoryHessianPercent,
       const double categoricalSmoothing,
       const IntEbm maxCategoricalThreshold,
       const double categoricalInclusionPercent,
@@ -569,7 +568,6 @@ BoostRet TestBoost::Boost(const IntEbm indexTerm,
          regLambda,
          maxDeltaStep,
          minCategorySamples,
-         minCategoryHessianPercent,
          categoricalSmoothing,
          maxCategoricalThreshold,
          categoricalInclusionPercent,
diff --git a/shared/libebm/tests/libebm_test.hpp b/shared/libebm/tests/libebm_test.hpp
index 16e01c4a5..8a76b2b67 100644
--- a/shared/libebm/tests/libebm_test.hpp
+++ b/shared/libebm/tests/libebm_test.hpp
@@ -295,7 +295,6 @@ static constexpr double k_regAlphaDefault = 0.0;
 static constexpr double k_regLambdaDefault = 0.0;
 static constexpr double k_maxDeltaStepDefault = 0.0;
 static constexpr IntEbm k_minCategorySamplesDefault = 0;
-static constexpr double k_minCategoryHessianPercentDefault = 0.0;
 static constexpr double k_categoricalSmoothingDefault = 10.0;
 static constexpr IntEbm k_maxCategoricalThresholdDefault = IntEbm{32};
 static constexpr double k_categoricalInclusionPercentDefault = 0.75;
@@ -492,7 +491,6 @@ class TestBoost {
          const double regLambda = k_regLambdaDefault,
          const double maxDeltaStep = k_maxDeltaStepDefault,
          IntEbm minCategorySamplesDefault = k_minCategorySamplesDefault,
-         double minCategoryHessianPercentDefault = k_minCategoryHessianPercentDefault,
          const double categoricalSmoothing = k_categoricalSmoothingDefault,
          const IntEbm maxCategoricalThreshold = k_maxCategoricalThresholdDefault,
          const double categoricalInclusionPercent = k_categoricalInclusionPercentDefault,