diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 index 1e56411c108..668e5e56695 100644 --- a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 +++ b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-cu118 @@ -57,7 +57,7 @@ ARG PYTHON_EXE=$MINICONDA_PREFIX/bin/python # (Optional) Intall test dependencies RUN $PYTHON_EXE -m pip install git+https://github.com/huggingface/transformers RUN $PYTHON_EXE -m pip install datasets accelerate evaluate coloredlogs absl-py rouge_score seqeval scipy sacrebleu nltk scikit-learn parameterized sentencepiece -RUN $PYTHON_EXE -m pip install fairscale deepspeed mpi4py +RUN $PYTHON_EXE -m pip install deepspeed mpi4py # RUN $PYTHON_EXE -m pip install optuna ray sigopt wandb # PyTorch diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 index a7068eb6acb..db2219b5c62 100644 --- a/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 +++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.14.1-cu116 @@ -48,7 +48,7 @@ RUN pip install pygit2 pgzip # (Optional) Intall test dependencies RUN pip install git+https://github.com/huggingface/transformers RUN pip install datasets accelerate evaluate coloredlogs absl-py rouge_score seqeval scipy sacrebleu nltk scikit-learn parameterized sentencepiece -RUN pip install fairscale deepspeed mpi4py +RUN pip install deepspeed mpi4py # RUN pip install optuna ray sigopt wandb # Install onnxruntime-training dependencies diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 index f267cc1fcf5..51c9ec514c4 100644 --- a/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 +++ b/examples/onnxruntime/training/docker/Dockerfile-ort1.15.1-cu118 @@ -57,7 +57,7 @@ ARG PYTHON_EXE=$MINICONDA_PREFIX/bin/python # (Optional) Intall test dependencies RUN $PYTHON_EXE -m pip install git+https://github.com/huggingface/transformers RUN $PYTHON_EXE -m pip install datasets accelerate evaluate coloredlogs absl-py rouge_score seqeval scipy sacrebleu nltk scikit-learn parameterized sentencepiece -RUN $PYTHON_EXE -m pip install fairscale deepspeed mpi4py +RUN $PYTHON_EXE -m pip install deepspeed mpi4py # RUN $PYTHON_EXE -m pip install optuna ray sigopt wandb # PyTorch diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 1244d8bb4b8..89363b6b266 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -28,10 +28,7 @@ # Integrations must be imported before ML frameworks: # isort: off -from transformers.integrations import ( - hp_params, - is_fairscale_available, -) +from transformers.integrations import hp_params from transformers.utils import is_accelerate_available from packaging import version @@ -60,7 +57,6 @@ from transformers.data.data_collator import DataCollator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow from transformers.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_zero3_enabled -from transformers.dependency_versions_check import dep_version_check from transformers.file_utils import ( is_apex_available, is_sagemaker_dp_enabled, @@ -88,7 +84,6 @@ EvalPrediction, HPSearchBackend, PredictionOutput, - ShardedDDPOption, TrainOutput, denumpify_detensorize, enable_full_determinism, @@ -133,11 +128,6 @@ if is_torch_tpu_available(check_device=False): import torch_xla.core.xla_model as xm -if is_fairscale_available(): - dep_version_check("fairscale") - from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP - from fairscale.optim import OSS - if TYPE_CHECKING: import optuna @@ -533,12 +523,7 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = ( - self.sharded_ddp is not None - and self.sharded_ddp != ShardedDDPOption.SIMPLE - or self.fsdp is not None - or self.is_fsdp_enabled - ) + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled # Wrap the model with `ORTModule` logger.info("Wrap ORTModule for ONNX Runtime training.") @@ -582,7 +567,7 @@ def _inner_training_loop( # as the model is wrapped, don't use `accelerator.prepare` # this is for unhandled cases such as - # Fairscale Sharded DDP, FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX use_accelerator_prepare = True if model is self.model else False if delay_optimizer_creation: @@ -793,10 +778,6 @@ def _inner_training_loop( if args.max_grad_norm is not None and args.max_grad_norm > 0: # deepspeed does its own clipping - if self.do_grad_scaling: - # AMP: gradients need unscaling - self.scaler.unscale_(self.optimizer) - if is_sagemaker_mp_enabled() and args.fp16: self.optimizer.clip_master_grads(args.max_grad_norm) elif hasattr(self.optimizer, "clip_grad_norm"): @@ -812,18 +793,8 @@ def _inner_training_loop( ) # Optimizer step - optimizer_was_run = True - if is_torch_tpu_available(): - raise NotImplementedError("`ORTTrainer` is not supported by TPU!") - elif self.do_grad_scaling: - scale_before = self.scaler.get_scale() - self.scaler.step(self.optimizer) - self.scaler.update() - scale_after = self.scaler.get_scale() - optimizer_was_run = scale_before <= scale_after - else: - self.optimizer.step() - optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + self.optimizer.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped if optimizer_was_run: # Delay optimizer scheduling until metrics are generated @@ -1689,19 +1660,8 @@ def _wrap_model(self, model, training=True, dataloader=None): if not training: return model - # Distributed training (should be after apex fp16 initialization) - if self.sharded_ddp is not None: - # Sharded DDP! - if self.sharded_ddp == ShardedDDPOption.SIMPLE: - model = ShardedDDP(model, self.optimizer) - else: - raise NotImplementedError( - "Fairscale's zero_dp_2 and zero_dp_3 are not compatible with `torch_ort.ORTModule`" - " used in `ORTTrainer`. Use `--sharded_ddp simpe` or deepspeed stage 2 if you want" - "the gradient to be sharded." - ) # Distributed training using PyTorch FSDP - elif self.fsdp is not None: + if self.fsdp is not None: try: from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP from torch_xla.distributed.fsdp import checkpoint_module @@ -1806,27 +1766,20 @@ def create_optimizer(self): else: optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) - if self.sharded_ddp == ShardedDDPOption.SIMPLE: - self.optimizer = OSS( - params=optimizer_grouped_parameters, - optim=optimizer_cls, - **optimizer_kwargs, - ) - else: - self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) - if optimizer_cls.__name__ == "Adam8bit": - import bitsandbytes - - manager = bitsandbytes.optim.GlobalOptimManager.get_instance() - - skipped = 0 - for module in opt_model.modules(): - if isinstance(module, nn.Embedding): - skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values()) - logger.info(f"skipped {module}: {skipped/2**20}M params") - manager.register_module_override(module, "weight", {"optim_bits": 32}) - logger.debug(f"bitsandbytes: will optimize {module} in fp32") - logger.info(f"skipped: {skipped/2**20}M params") + self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + if optimizer_cls.__name__ == "Adam8bit": + import bitsandbytes + + manager = bitsandbytes.optim.GlobalOptimManager.get_instance() + + skipped = 0 + for module in opt_model.modules(): + if isinstance(module, nn.Embedding): + skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values()) + logger.info(f"skipped {module}: {skipped/2**20}M params") + manager.register_module_override(module, "weight", {"optim_bits": 32}) + logger.debug(f"bitsandbytes: will optimize {module} in fp32") + logger.info(f"skipped: {skipped/2**20}M params") if is_sagemaker_mp_enabled(): raise NotImplementedError( diff --git a/optimum/onnxruntime/trainer_seq2seq.py b/optimum/onnxruntime/trainer_seq2seq.py index 0528e3f6f79..a281462fdef 100644 --- a/optimum/onnxruntime/trainer_seq2seq.py +++ b/optimum/onnxruntime/trainer_seq2seq.py @@ -547,7 +547,6 @@ def prediction_step_ort( has_labels = "labels" in inputs inputs = self._prepare_inputs(inputs) - # XXX: adapt synced_gpus for fairscale as well # Priority (handled in generate): # gen_kwargs > model.generation_config > default GenerationConfig() @@ -658,7 +657,6 @@ def prediction_step( has_labels = "labels" in inputs inputs = self._prepare_inputs(inputs) - # XXX: adapt synced_gpus for fairscale as well # Priority (handled in generate): # gen_kwargs > model.generation_config > default GenerationConfig() diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index a15ba263295..88e5fc9bc2e 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -29,7 +29,6 @@ HubStrategy, IntervalStrategy, SchedulerType, - ShardedDDPOption, ) from transformers.training_args import OptimizerNames, default_logdir, logger from transformers.utils import ( @@ -211,8 +210,6 @@ def __post_init__(self): " `--half_precision_backend apex`: GPU bf16 is not supported by apex. Use" " `--half_precision_backend cuda_amp` instead" ) - if not (self.sharded_ddp == "" or not self.sharded_ddp): - raise ValueError("sharded_ddp is not supported with bf16") if self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU: if self.evaluation_strategy == IntervalStrategy.NO: @@ -329,26 +326,6 @@ def __post_init__(self): " during training" ) - if not (self.sharded_ddp == "" or not self.sharded_ddp): - warnings.warn( - "using `sharded_ddp` is deprecated and will be removed in version 4.33" - " of 🤗 Transformers. Use `fsdp` instead", - FutureWarning, - ) - if isinstance(self.sharded_ddp, bool): - self.sharded_ddp = "simple" if self.sharded_ddp else "" - if isinstance(self.sharded_ddp, str): - self.sharded_ddp = [ShardedDDPOption(s) for s in self.sharded_ddp.split()] - if self.sharded_ddp == [ShardedDDPOption.OFFLOAD]: - raise ValueError( - "`--sharded_ddp offload` can't work on its own. It needs to be added to `--sharded_ddp zero_dp_2` or " - '`--sharded_ddp zero_dp_3`. For example, `--sharded_ddp "zero_dp_2 offload"`.' - ) - elif len(self.sharded_ddp) > 1 and ShardedDDPOption.SIMPLE in self.sharded_ddp: - raise ValueError("`--sharded_ddp simple` is not compatible with any other option.") - elif ShardedDDPOption.ZERO_DP_2 in self.sharded_ddp and ShardedDDPOption.ZERO_DP_3 in self.sharded_ddp: - raise ValueError("`--sharded_ddp zero_dp_2` is not compatible with `--sharded_ddp zero_dp_3`.") - if isinstance(self.fsdp, bool): self.fsdp = "full_shard" if self.fsdp else "" if isinstance(self.fsdp, str): @@ -516,7 +493,7 @@ def __post_init__(self): ) # if training args is specified, it will override the one specified in the accelerate config - if self.half_precision_backend != "apex" and len(self.sharded_ddp) == 0: + if self.half_precision_backend != "apex": mixed_precision_dtype = os.environ.get("ACCELERATE_MIXED_PRECISION", "no") if self.fp16: mixed_precision_dtype = "fp16" diff --git a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer index a1b3b56bd99..62f7efc8178 100644 --- a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer +++ b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer @@ -57,7 +57,7 @@ ARG PYTHON_EXE=$MINICONDA_PREFIX/bin/python # (Optional) Intall test dependencies RUN $PYTHON_EXE -m pip install git+https://github.com/huggingface/transformers RUN $PYTHON_EXE -m pip install datasets accelerate evaluate coloredlogs absl-py rouge_score seqeval scipy sacrebleu nltk scikit-learn parameterized sentencepiece -RUN $PYTHON_EXE -m pip install fairscale deepspeed mpi4py +RUN $PYTHON_EXE -m pip install deepspeed mpi4py # RUN $PYTHON_EXE -m pip install optuna ray sigopt wandb # PyTorch