From 330926552bf6c020270402059d12e22bd707e89b Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Mon, 18 Nov 2024 05:09:48 -0800
Subject: [PATCH 1/4] fix stable_train_samples

---
 src/transformers/trainer.py       | 7 +++----
 src/transformers/training_args.py | 9 +++------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index b4099d98f25a53..c011db69cb102a 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -2344,7 +2344,7 @@ def _inner_training_loop(
                     self._load_rng_state(resume_from_checkpoint)
                     rng_to_sync = False
 
-                if (self.state.global_step == 10):
+                if (self.state.global_step == args.stable_train_warmup_steps):
                     start_train_stable_time = time.time()
 
                 # Skip past any already trained steps if resuming training
@@ -2499,9 +2499,8 @@ def _inner_training_loop(
         metrics = speed_metrics("train", start_time, num_samples=num_train_samples,
             num_steps=self.state.max_steps,num_tokens=num_train_tokens,)
 
-        total_samples = self.state.global_step*total_train_batch_size if args.max_steps > 0 else num_examples*num_train_epochs
-        perf_samples = total_samples - self.args.warmup_steps*total_train_batch_size
-        stable_train_metrics = speed_metrics("stable_train", start_train_stable_time, perf_samples)
+        stable_train_samples = num_train_samples - args.stable_train_warmup_steps*total_train_batch_size
+        stable_train_metrics = speed_metrics("stable_train", start_train_stable_time, stable_train_samples)
 
         self.store_flos()
         metrics["total_flos"] = self.state.total_flos
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 435cd3938449d4..94987059aa7dc4 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -310,6 +310,8 @@ class TrainingArguments:
             Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
         warmup_steps (`int`, *optional*, defaults to 0):
             Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`.
+        stable_train_warmup_steps (`int`, *optional*, defaults to 10):
+            Number of steps to skip before collecting performance numbers for stable_train_samples_per_second.
         log_level (`str`, *optional*, defaults to `passive`):
             Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug',
             'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and keeps the
@@ -599,8 +601,6 @@ class TrainingArguments:
             If `True`, an `Accelerator` or `PartialState` must be initialized. Note that by doing so, this could lead
             to issues with hyperparameter tuning.
 
-        ortmodule (:obj:`bool`, `optional`):
-            Use `ORTModule `__.
         label_smoothing_factor (`float`, *optional*, defaults to 0.0):
             The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded
             labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor +
@@ -912,6 +912,7 @@ class TrainingArguments:
         default=0.0, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."}
     )
     warmup_steps: int = field(default=10, metadata={"help": "Linear warmup over warmup_steps."})
+    stable_train_warmup_steps: int = field(default=10, metadata={"help": "warmup steps to skip before collecting training performance."})
 
     log_level: Optional[str] = field(
         default="passive",
@@ -1258,10 +1259,6 @@ class TrainingArguments:
             )
         },
     )
-    ort: Optional[bool] = field(
-        default=False,
-        metadata={"help": "Enable Ort"},
-    )
     label_smoothing_factor: float = field(
         default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
     )

From fa8b8c8a341e9eb63b9e796042dc971c9e1e747c Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Tue, 3 Dec 2024 13:03:16 -0600
Subject: [PATCH 2/4] wip

---
 src/transformers/training_args.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 15d058f18be402..de9500c02a187c 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -606,7 +606,8 @@ class TrainingArguments:
             Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling
             `TrainingArguments`. If `True`, an `Accelerator` or `PartialState` must be initialized. Note that by doing
             so, this could lead to issues with hyperparameter tuning.
-
+        ort (:obj:`bool`, `optional`):
+            Use `ORTModule `__.
         label_smoothing_factor (`float`, *optional*, defaults to 0.0):
             The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded
             labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor +
@@ -1269,6 +1270,10 @@ class TrainingArguments:
             )
         },
     )
+    ort: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enable Ort"},
+    )
     label_smoothing_factor: float = field(
         default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
     )

From 8b3dce22c094e1cc16d6258e8e7bad97807f1de3 Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Tue, 3 Dec 2024 13:04:01 -0600
Subject: [PATCH 3/4] wip

---
 src/transformers/training_args.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index de9500c02a187c..ac82abdc0a2dd2 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -315,7 +315,7 @@ class TrainingArguments:
             Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
         warmup_steps (`int`, *optional*, defaults to 0):
             Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`.
-        stable_train_warmup_steps (`int`, *optional*, defaults to 10):
+        stable_train_warmup_steps (`int`, *optional*, defaults to 0):
             Number of steps to skip before collecting performance numbers for stable_train_samples_per_second.
         log_level (`str`, *optional*, defaults to `passive`):
             Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug',
@@ -923,7 +923,7 @@ class TrainingArguments:
         default=0.0, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."}
     )
     warmup_steps: int = field(default=10, metadata={"help": "Linear warmup over warmup_steps."})
-    stable_train_warmup_steps: int = field(default=10, metadata={"help": "warmup steps to skip before collecting training performance."})
+    stable_train_warmup_steps: int = field(default=0, metadata={"help": "warmup steps to skip before collecting training performance."})
 
     log_level: Optional[str] = field(
         default="passive",

From 799dd2883d5e1df5d340e266890cd7bf0626a79f Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Tue, 3 Dec 2024 17:56:09 -0600
Subject: [PATCH 4/4] fix merge

---
 src/transformers/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index e2c05350d08fe5..fe4d9e56ca5f4b 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -2485,7 +2485,7 @@ def _inner_training_loop(
                 else:
                     self.accelerator.gradient_state._set_sync_gradients(True)
 
-                if (self.state.global_step == 10):
+                if (self.state.global_step == args.stable_train_warmup_steps):
                     start_train_stable_time = time.time()
 
                 if self.args.include_num_input_tokens_seen:
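
A minimal usage sketch of the new argument, assuming the patched fork above is the installed transformers (the checkpoint name, output directory, and synthetic dataset are placeholders, not part of the patches): once `global_step` reaches `stable_train_warmup_steps`, the trainer starts a second timer, and the `stable_train` speed metrics are computed over `num_train_samples - stable_train_warmup_steps * total_train_batch_size`, so the reported throughput excludes the noisy first steps.

import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments


class RandomTokenDataset(Dataset):
    """Tiny synthetic classification dataset so the sketch runs without downloading data."""

    def __init__(self, vocab_size, size=256, seq_len=32):
        self.input_ids = torch.randint(0, vocab_size, (size, seq_len))
        self.labels = torch.randint(0, 2, (size,))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {"input_ids": self.input_ids[idx], "labels": self.labels[idx]}


model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")  # placeholder checkpoint

args = TrainingArguments(
    output_dir="out",
    max_steps=100,
    per_device_train_batch_size=8,
    # Field added by this patch series: steps to skip before the stable-throughput timer starts.
    stable_train_warmup_steps=10,
)

trainer = Trainer(model=model, args=args, train_dataset=RandomTokenDataset(model.config.vocab_size))
result = trainer.train()

# "train_samples_per_second" times the whole run; the "stable_train" split computed by this fork
# excludes the first stable_train_warmup_steps steps (.get() is used because these hunks do not
# show whether the fork merges stable_train_metrics into the returned metrics).
print(result.metrics["train_samples_per_second"])
print(result.metrics.get("stable_train_samples_per_second"))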