Upgrade XgBoost to 1.7.x (#362)
* Upgrade XgBoost version to 1.7.3

* Restructure checkpointing.py and fix Unit tests

* Resolve Flake8 style errors

* Address Comments and improvements

* Add comments and improvements

* Upgrade xgboost to 1.7.4

* remove grow_local_histmaker and Single precision histogram

---------

Co-authored-by: Malav Shastri <[email protected]>
malav-shastri and Malav Shastri authored Mar 6, 2023
1 parent 3aa31e9 commit 6dcd442
Showing 14 changed files with 167 additions and 137 deletions.
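Most of the work lands in src/sagemaker_xgboost_container/checkpointing.py, which is rewritten against the class-based callback API required by XGBoost 1.7 now that the old env-based function callbacks (and helpers like xgboost.callback._fmt_metric) are gone. For context, here is a minimal sketch of that interface; the class name, data, and parameters are illustrative only and are not part of this diff.

import xgboost as xgb

class EveryRoundLogger(xgb.callback.TrainingCallback):
    """Toy callback illustrating the class-based hooks used throughout this commit."""

    def after_iteration(self, model, epoch, evals_log):
        # evals_log is a nested dict: {dataset_name: {metric_name: [scores, ...]}}
        for data, metrics in evals_log.items():
            for name, history in metrics.items():
                print("[%d] %s-%s: %s" % (epoch, data, name, history[-1]))
        return False  # returning True would stop training early

# Illustrative usage; the data path is a placeholder:
# dtrain = xgb.DMatrix("train.libsvm")
# xgb.train({"objective": "reg:squarederror"}, dtrain, num_boost_round=10,
#           evals=[(dtrain, "train")], callbacks=[EveryRoundLogger()])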
2 changes: 1 addition & 1 deletion README.rst
@@ -253,4 +253,4 @@ SageMaker XGboost Framework Container is licensed under the Apache 2.0 License.
.com, Inc. or its affiliates. All Rights Reserved. The license is available at:
http://aws.amazon.com/apache2.0/

.. |XGBoostLatestVersion| replace:: 1.5-1
.. |XGBoostLatestVersion| replace:: 1.7-1
@@ -11,7 +11,7 @@ ARG CONDA_PKG_VERSION=4.10.1
ARG PYTHON_VERSION=3.8.13
ARG PYARROW_VERSION=1.0
ARG MLIO_VERSION=0.7.0
ARG XGBOOST_VERSION=1.5.2
ARG XGBOOST_VERSION=1.7.4

ENV DEBIAN_FRONTEND=noninteractive
ENV LANG=C.UTF-8
File renamed without changes.
@@ -1,4 +1,4 @@
ARG SAGEMAKER_XGBOOST_VERSION=1.5-1
ARG SAGEMAKER_XGBOOST_VERSION=1.7-1
ARG PYTHON_VERSION=3.8

FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3
File renamed without changes.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,7 +3,7 @@ PyYAML==5.4.1
Pillow==9.1.1
boto3==1.17.52
botocore==1.20.52
cryptography==35.0.0
cryptography==39.0.1
dask==2022.11.1
dask-cuda==22.12.0
gunicorn==19.10.0
@@ -29,7 +29,6 @@ def updater_validator(value, dependencies):
"grow_colmaker",
"distcol",
"grow_histmaker",
"grow_local_histmaker",
"grow_skmaker",
"sync",
"refresh",
@@ -40,7 +39,6 @@ def updater_validator(value, dependencies):
"grow_colmaker",
"distcol",
"grow_histmaker",
"grow_local_histmaker",
"grow_colmaker",
"grow_quantile_histmaker",
]
@@ -62,7 +60,7 @@ def updater_validator(value, dependencies):
if not all(x in valid_tree_plugins for x in value):
raise exc.UserError(
"Tree updater should be selected from these options: 'grow_colmaker', 'distcol', 'grow_histmaker', "
"'grow_local_histmaker', 'grow_skmaker', 'grow_quantile_histmaker', 'sync', 'refresh', 'prune', "
"'grow_skmaker', 'grow_quantile_histmaker', 'sync', 'refresh', 'prune', "
"'shortgun', 'coord_descent'."
)
# validate only one tree updater is selected
@@ -74,7 +72,7 @@ def updater_validator(value, dependencies):
raise exc.UserError(
"Only one tree grow plugin can be selected. Choose one from the"
"following: 'grow_colmaker', 'distcol', 'grow_histmaker', "
"'grow_local_histmaker', 'grow_skmaker'"
"'grow_skmaker'"
)

@hpv.range_validator(["auto", "cpu_predictor", "gpu_predictor"])
@@ -239,15 +237,13 @@ def interaction_constraints_validator(value, dependencies):
"grow_colmaker",
"distcol",
"grow_histmaker",
"grow_local_histmaker",
"grow_skmaker",
"sync",
"refresh",
"prune",
"grow_colmaker",
"distcol",
"grow_histmaker",
"grow_local_histmaker",
"grow_colmaker",
"shotgun",
"coord_descent",
@@ -334,7 +330,6 @@ def interaction_constraints_validator(value, dependencies):
hpv.ContinuousHyperparameter(
name="aft_loss_distribution_scale", range=hpv.Interval(min_closed=0), required=False
),
hpv.CategoricalHyperparameter(name="single_precision_histogram", range=["true", "false"], required=False),
hpv.CategoricalHyperparameter(name="deterministic_histogram", range=["true", "false"], required=False),
hpv.CategoricalHyperparameter(name="sampling_method", range=["uniform", "gradient_based"], required=False),
hpv.IntegerHyperparameter(name="prob_buffer_row", range=hpv.Interval(min_open=1.0), required=False),
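Both removals in this file are user-visible: grow_local_histmaker is no longer a valid updater plugin and single_precision_histogram is no longer an accepted hyperparameter. An illustrative before/after sketch (only the hyperparameter names come from this diff; the values and surrounding dicts are examples):

# Previously accepted, now rejected by the updated validator:
old_hyperparameters = {
    "updater": "grow_local_histmaker",     # plugin removed for XGBoost 1.7
    "single_precision_histogram": "true",  # hyperparameter removed entirely
}

# A configuration that still validates (illustrative values):
new_hyperparameters = {
    "updater": "grow_quantile_histmaker",  # hist-based updater, still in the valid list
    "tree_method": "hist",
}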
178 changes: 107 additions & 71 deletions src/sagemaker_xgboost_container/checkpointing.py
@@ -6,9 +6,10 @@
import threading

import xgboost as xgb
from typing import Optional
from xgboost import rabit
from xgboost.callback import _fmt_metric as format_metric
from xgboost.core import Booster, XGBoostError
from xgboost.callback import EvaluationMonitor
from xgboost.core import XGBoostError

TEMP_FILE_SUFFIX = ".sagemaker-ignore"
FILE_LOCK_SUFFIX = ".sagemaker-uploading"
@@ -42,29 +43,33 @@ def train(train_args, checkpoint_dir):

xgb_model, start_iteration = load_checkpoint(checkpoint_dir)

# xgboost's default value for num_boost_round is 10.
# https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.training
# If num_boost_round <= 0, xgb.train() doesn't actually train and
# immediately returns a Booster object.
train_args["num_boost_round"] = train_args.get("num_boost_round", 10) - start_iteration

if xgb_model is not None:
logging.info("Checkpoint loaded from %s", xgb_model)
logging.info("Resuming from iteration %s", start_iteration)

callbacks = train_args.get("callbacks", [])
callbacks.append(print_checkpointed_evaluation(start_iteration=start_iteration))
callbacks.append(save_checkpoint(checkpoint_dir, start_iteration=start_iteration))
callbacks.append(print_checkpointed_evaluation(start_iteration=start_iteration,
end_iteration=train_args["num_boost_round"]))
callbacks.append(save_checkpoint(checkpoint_dir, start_iteration=start_iteration, iteration=start_iteration,
end_iteration=train_args["num_boost_round"]))

train_args["verbose_eval"] = False # suppress xgboost's print_evaluation()
train_args["xgb_model"] = xgb_model
train_args["callbacks"] = callbacks
# xgboost's default value for num_boost_round is 10.
# If num_boost_round <= 0, xgb.train() doesn't actually train and
# immediately returns a Booster object.
train_args["num_boost_round"] = train_args.get("num_boost_round", 10) - start_iteration

booster = xgb.train(**train_args)

return booster


def print_checkpointed_evaluation(period=1, show_stdv=True, start_iteration=0):
"""Create a callback that print evaluation result.
class PrintCheckpoint(xgb.callback.TrainingCallback):
"""Create a callback that print evaluation result every period iteration.
This function was modified from https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/callback.py
The only difference between the following function and the original function in xgboost.callback
@@ -73,41 +78,62 @@ def print_checkpointed_evaluation(period=1, show_stdv=True, start_iteration=0):
We print the evaluation results every **period** iterations
and on the first and the last iterations.
Parameters
Attributes
----------
period : int
The period to log the evaluation results
The period to log the evaluation results
show_stdv : bool, optional
Whether show stdv if provided
Whether show stdv if provided
start_iteration: int, optional
Used for offsetting the iteration number that appears at the beginning of each evaluation result in the logs.
Returns
-------
callback : function
A callback that print evaluation every period iterations.
Used for offsetting the iteration number that appears at the beginning of each evaluation result in the logs.
"""

def callback(env):
"""internal function"""
if env.rank != 0 or (not env.evaluation_result_list) or period is False or period == 0:
return
i = env.iteration
if i % period == 0 or i + 1 == env.begin_iteration or i + 1 == env.end_iteration:
msg = "\t".join([format_metric(x, show_stdv) for x in env.evaluation_result_list])
rabit.tracker_print("[%d]\t%s\n" % (i + start_iteration, msg))
def __init__(self, end_iteration, iteration=0, rank=0, period=1, show_stdv=True, start_iteration=0):
self.period = period
self.show_stdv = show_stdv
self.start_iteration = start_iteration
self.rank = rank
self.iteration = iteration
self.end_iteration = end_iteration

return callback
def __call__(self, model, epoch=0, evals_log=None):
return self.after_iteration(model, epoch, evals_log)

def after_iteration(self, model, epoch=0, evals_log=None):
if self.rank != 0 or (not evals_log) or self.period is False or self.period == 0:
return
i = self.iteration
if i % self.period == 0 or i + 1 == self.start_iteration or i + 1 == self.end_iteration:
evaluation_monitor = EvaluationMonitor(self.rank, self.period, self.show_stdv)
msg: str = ""
for data, metric in evals_log.items():
for metric_name, log in metric.items():
stdv: Optional[float] = None
if isinstance(log[-1], tuple):
score = log[-1][0]
stdv = log[-1][1]
else:
score = log[-1]
msg += evaluation_monitor._fmt_metric(data, metric_name, score, stdv)
msg += "\n"
rabit.tracker_print("[%d]\t%s\n" % (i + self.start_iteration, msg))


def print_checkpointed_evaluation(end_iteration, iteration=0, rank=0, period=1, show_stdv=True, start_iteration=0):
"""A callback function that print evaluation result every period iteration.
This is a wrapper function around PrintCheckpoint.
For details, see PrintCheckpoint.
"""
return PrintCheckpoint(end_iteration, iteration, rank, period, show_stdv, start_iteration)


def load_checkpoint(checkpoint_dir, max_try=5):
"""
:param checkpoint_dir: e.g., /opt/ml/checkpoints
:param max_try: number of times to try loading checkpoint before giving up.
:return xgb_model: file path of stored xgb model. None if no checkpoint.
:return iteration: iterations completed before last checkpoiint.
:return iteration: iterations completed before last checkpoint.
"""
if not checkpoint_dir or not os.path.exists(checkpoint_dir):
return None, 0
@@ -124,9 +150,6 @@ def load_checkpoint(checkpoint_dir, max_try=5):
try:
latest_checkpoint = checkpoints.pop()
xgb_model = os.path.join(checkpoint_dir, latest_checkpoint)
booster = Booster()
booster.load_model(xgb_model)

filename, extension = latest_checkpoint.split(".")
iteration = int(extension) + 1
break
@@ -141,18 +164,20 @@ def _sort_checkpoints(checkpoint_files):
return checkpoint_files


def save_checkpoint(checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None):
def save_checkpoint(checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None, rank=0, iteration=0,
end_iteration=None):
"""A callback function that saves checkpoints to disk.
This is a wrapper function around SaveCheckpoint.
For details, see SaveCheckpoint.
"""
return SaveCheckpoint(
checkpoint_dir=checkpoint_dir, start_iteration=start_iteration, max_to_keep=max_to_keep, num_round=num_round
return SaveCheckpointCallBack(
checkpoint_dir=checkpoint_dir, start_iteration=start_iteration, max_to_keep=max_to_keep, num_round=num_round,
iteration=iteration, end_iteration=end_iteration
)


class SaveCheckpoint(object):
class SaveCheckpointCallBack(xgb.callback.TrainingCallback):
"""Create a callback that saves checkpoints to disk.
The main purpose of this class is to support checkpointing for managed spot
@@ -192,19 +217,23 @@ class SaveCheckpoint(object):
after round 19, start_iteration will be 20).
num_round: (optional) indicates the number of boosting rounds.
Example:
>>> save_checkpoint = SaveCheckpoint("/opt/ml/checkpoints")
>>> xgboost.train(prams, dtrain, callbacks=[save_checkpoint])
"""
Example:
>>> save_checkpoint = SaveCheckpoint("/opt/ml/checkpoints")
>>> xgboost.train(params, dtrain, callbacks=[save_checkpoint])
"""

SENTINEL = None

def __init__(self, checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None):
def __init__(self, checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=None, rank=0, iteration=0,
end_iteration=None):
"""Init SaveCheckpoint with checkpoint_dir"""
self.checkpoint_dir = checkpoint_dir
self.max_to_keep = max_to_keep
self.start_iteration = start_iteration
self.num_round = num_round
self.rank = rank
self.iteration = iteration
self.end_iteration = end_iteration

if not os.path.exists(self.checkpoint_dir):
os.makedirs(self.checkpoint_dir)
@@ -215,16 +244,46 @@ def __init__(self, checkpoint_dir, start_iteration=0, max_to_keep=5, num_round=N

self.start()

def __call__(self, env):
def __call__(self, model, epoch=0, evals_log=None):
"""Make the class callable since it is meant be used as a callback"""
return self.callback(env)
return self.after_iteration(model, epoch, evals_log)

def format_path(self, iteration):
"""Return a file path to checkpoint given a iteration number"""
filename = "{}.{}".format(CHECKPOINT_FILENAME, iteration)
checkpoint_path = os.path.join(self.checkpoint_dir, filename)
return checkpoint_path

def after_iteration(self, model, epoch=0, evals_log=None) -> bool:
# rank: master node has rank 0.
# iteration: current boosting round
# end_iteration: round # when training will end. this is always num_round + 1.
# model: model object
if self.rank != 0:
logger.debug("Not master (rank = %d). Exiting checkpoint callback.", self.rank)
return

if len(os.listdir(self.checkpoint_dir)) != 0:
xgb_model, self.iteration = load_checkpoint(self.checkpoint_dir)
current_iteration = self.iteration
else:
current_iteration = self.start_iteration + self.iteration
self._save_checkpoint(model, current_iteration)

# For example, if we are at iteration 5 and max_to_keep is 5, we no
# longer need checkpoint from iteration 0 (i.e., xgboost-checkpoint.0),
# so we put iteration_to_delete = 0 on the queue.
iteration_to_delete = current_iteration - self.max_to_keep
self.delete_queue.put(iteration_to_delete)

offset_iteration = self.end_iteration if self.num_round is None else self.num_round

training_has_ended = current_iteration + 1 >= self.start_iteration + offset_iteration

if training_has_ended:
self.stop()
return False

def start(self):
"""Start a background thread that deletes old checkpoints
Expand All @@ -236,7 +295,6 @@ def start(self):
When training is complete, we put SENTINEL on the queue, and when we
see the SENTINEL, we clean up and exit the thread.
"""

def _is_uploading(path):
uploading = os.path.isfile(path + FILE_LOCK_SUFFIX)
uploaded = os.path.isfile(path + FILE_SAFE_SUFFIX)
@@ -286,7 +344,9 @@ def _delete_uploaded_files_and_cleanup():
_delete_uploaded_files()
_cleanup()

self.thread = threading.Thread(target=_delete_uploaded_files_and_cleanup, daemon=True)
self.thread = threading.Thread(
target=_delete_uploaded_files_and_cleanup,
daemon=True)
self.thread.start()

def stop(self):
@@ -304,30 +364,6 @@ def _save_checkpoint(self, model, iteration):
save_file_path = self.format_path(iteration)
os.rename(tf.name, save_file_path)

def callback(self, env):
# env.rank: rabit rank of the node/process. master node has rank 0.
# env.iteration: current boosting round
# env.begin_iteration: round # when training started. this is always 0.
# env.end_iteration: round # when training will end. this is always num_round + 1.
# env.model: model object
if env.rank != 0:
logger.debug("Not master (rank = %d). Exiting checkpoint callback.", env.rank)
return

current_iteration = self.start_iteration + env.iteration
self._save_checkpoint(env.model, current_iteration)

# For example, if we are at iteration 5 and max_to_keep is 5, we no
# longer need checkpoint from iteration 0 (i.e., xgboost-checkpoint.0),
# so we put iteration_to_delete = 0 on the queue.
iteration_to_delete = current_iteration - self.max_to_keep
self.delete_queue.put(iteration_to_delete)

offset_iteration = env.end_iteration if self.num_round is None else self.num_round
training_has_ended = current_iteration + 1 >= self.start_iteration + offset_iteration
if training_has_ended:
self.stop()


def save_intermediate_model(intermediate_model_dir, model_name):
"""A callback function that saves intermediate models to disk.
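Through the restructure, the public entry point keeps its contract: train() still loads the newest xgboost-checkpoint.N file, trims num_boost_round by the rounds already completed, and registers the print/save callbacks before delegating to xgb.train(). A usage sketch under those assumptions (data, params, and paths below are placeholders, not from this diff):

import xgboost as xgb
from sagemaker_xgboost_container import checkpointing

dtrain = xgb.DMatrix("/opt/ml/input/data/train/train.libsvm")  # placeholder path
train_args = {
    "params": {"objective": "reg:squarederror", "tree_method": "hist"},
    "dtrain": dtrain,
    "num_boost_round": 100,
    "evals": [(dtrain, "train")],
}

# Resumes from /opt/ml/checkpoints if a checkpoint exists, otherwise starts fresh.
booster = checkpointing.train(train_args, checkpoint_dir="/opt/ml/checkpoints")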
2 changes: 1 addition & 1 deletion test/resources/versions/train.py
@@ -11,7 +11,7 @@
boto3==1.17.52
botocore==1.20.52
conda==4.10.1
cryptography==35.0.0
cryptography==39.0.1
gunicorn==19.10.0
matplotlib==3.4.1
multi-model-server==1.1.2
