Merge pull request #67 from ROCm/fix_stable_train_samples

Fix stable_train_samples calculation
ROCm committed on Jan 9, 2025 · commit d8b4c30
2 parents: 3c34382 + 799dd28
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 6 deletions.
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
@@ -2485,7 +2485,7 @@ def _inner_training_loop(
                     else:
                         self.accelerator.gradient_state._set_sync_gradients(True)
 
-                    if (self.state.global_step == 10):
+                    if (self.state.global_step == args.stable_train_warmup_steps):
                         start_train_stable_time = time.time()
 
                     if self.args.include_num_input_tokens_seen:
@@ -2659,9 +2659,8 @@ def _inner_training_loop(
 
         metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps,num_tokens=num_train_tokens,)
 
-        total_samples = self.state.global_step*total_train_batch_size if args.max_steps > 0 else num_examples*num_train_epochs
-        perf_samples = total_samples - self.args.warmup_steps*total_train_batch_size
-        stable_train_metrics = speed_metrics("stable_train", start_train_stable_time, perf_samples)
+        stable_train_samples = num_train_samples - args.stable_train_warmup_steps*total_train_batch_size
+        stable_train_metrics = speed_metrics("stable_train", start_train_stable_time, stable_train_samples)
 
         self.store_flos()
         metrics["total_flos"] = self.state.total_flos

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
@@ -315,6 +315,8 @@ class TrainingArguments:
             Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
         warmup_steps (`int`, *optional*, defaults to 0):
             Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`.
+        stable_train_warmup_steps (`int`, *optional*, defaults to 0):
+            Number of steps to skip before collecting performance numbers for stable_train_samples_per_second.
         log_level (`str`, *optional*, defaults to `passive`):
             Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug',
             'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and keeps the
@@ -604,8 +606,7 @@ class TrainingArguments:
                     Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling `TrainingArguments`.
                     If `True`, an `Accelerator` or `PartialState` must be initialized. Note that by doing so, this could lead to issues
                     with hyperparameter tuning.
-
-        ortmodule (:obj:`bool`, `optional`):
+        ort (:obj:`bool`, `optional`):
             Use `ORTModule <https://github.com/microsoft/onnxruntime>`__.
         label_smoothing_factor (`float`, *optional*, defaults to 0.0):
             The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded
@@ -922,6 +923,7 @@ class TrainingArguments:
         default=0.0, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."}
     )
     warmup_steps: int = field(default=10, metadata={"help": "Linear warmup over warmup_steps."})
+    stable_train_warmup_steps: int = field(default=0, metadata={"help": "warmup steps to skip before collecting training performance."})
 
     log_level: Optional[str] = field(
         default="passive",