
Commit

fix when testing
JingyaHuang committed Sep 15, 2023
1 parent 15afde4 commit fec39e3
Showing 1 changed file with 9 additions and 18 deletions.
27 changes: 9 additions & 18 deletions optimum/onnxruntime/trainer.py
@@ -72,7 +72,6 @@
     IterableDatasetShard,
     SequentialDistributedSampler,
     find_batch_size,
-    get_dataloader_sampler,
     get_model_param_count,
     get_module_class_from_name,
     get_parameter_names,
@@ -342,7 +341,7 @@ def __init__(

         if feature is None:
             try:
-                self.feature = TasksManager.infer_task_from_model(args.model)
+                self.feature = TasksManager.infer_task_from_model(self.model)
             except KeyError:
                 pass
         else:
@@ -516,7 +515,6 @@ def _inner_training_loop(
         total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size

         len_dataloader = None
-        num_train_tokens = None
         if has_length(train_dataloader):
             len_dataloader = len(train_dataloader)
             num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps
@@ -530,25 +528,17 @@
                 # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's
                 # the best we can do.
                 num_train_samples = args.max_steps * total_train_batch_size
-                if args.include_tokens_per_second:
-                    num_train_tokens = (
-                        self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps
-                    )
             else:
                 max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)
                 num_train_epochs = math.ceil(args.num_train_epochs)
                 num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs
-                if args.include_tokens_per_second:
-                    num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs
         elif args.max_steps > 0:  # Rely on max_steps when dataloader does not have a working size
             max_steps = args.max_steps
             # Setting a very large number of epochs so we go as many times as necessary over the iterator.
             num_train_epochs = sys.maxsize
             num_update_steps_per_epoch = max_steps
             num_examples = total_train_batch_size * args.max_steps
             num_train_samples = args.max_steps * total_train_batch_size
-            if args.include_tokens_per_second:
-                num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps
         else:
             raise ValueError(
                 "args.max_steps must be set to a positive value if dataloader does not have a length, was"
@@ -753,6 +743,13 @@ def _inner_training_loop(

         self.control = self.callback_handler.on_train_begin(args, self.state, self.control)

+        # Temp: remove after transformers 4.34 release
+        def get_dataloader_sampler(dataloader):
+            if hasattr(dataloader, "batch_sampler") and dataloader.batch_sampler is not None:
+                return get_dataloader_sampler(dataloader.batch_sampler)
+            elif hasattr(dataloader, "sampler"):
+                return dataloader.sampler
+
         # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
         if not args.ignore_data_skip:
             for epoch in range(epochs_trained):
@@ -935,13 +932,7 @@ def _inner_training_loop(
         self._total_loss_scalar += tr_loss.item()
         train_loss = self._total_loss_scalar / self.state.global_step

-        metrics = speed_metrics(
-            "train",
-            start_time,
-            num_samples=num_train_samples,
-            num_steps=self.state.max_steps,
-            num_tokens=num_train_tokens,
-        )
+        metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps)
         self.store_flos()
         metrics["total_flos"] = self.state.total_flos
         metrics["train_loss"] = train_loss
