From e2ac4b50825f9a0f9861a5a481cf143c20af70a9 Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Fri, 2 Oct 2020 10:44:06 -0400 Subject: [PATCH 01/20] first draft of deepspeed trainer --- allennlp/training/__init__.py | 1 + allennlp/training/deepspeed_trainer.py | 856 ++++++++++++++++++ .../training/metrics/categorical_accuracy.py | 2 +- 3 files changed, 858 insertions(+), 1 deletion(-) create mode 100644 allennlp/training/deepspeed_trainer.py diff --git a/allennlp/training/__init__.py b/allennlp/training/__init__.py index 0fc7185d536..9cb0154bea0 100644 --- a/allennlp/training/__init__.py +++ b/allennlp/training/__init__.py @@ -1,6 +1,7 @@ from allennlp.training.checkpointer import Checkpointer from allennlp.training.tensorboard_writer import TensorboardWriter from allennlp.training.no_op_trainer import NoOpTrainer +from allennlp.training.deepspeed_trainer import DeepspeedTrainer from allennlp.training.trainer import ( Trainer, GradientDescentTrainer, diff --git a/allennlp/training/deepspeed_trainer.py b/allennlp/training/deepspeed_trainer.py new file mode 100644 index 00000000000..6ef41b27f6b --- /dev/null +++ b/allennlp/training/deepspeed_trainer.py @@ -0,0 +1,856 @@ +import logging +from deepspeed.utils import logger as ds_logger +ds_logger.setLevel(logging.WARNING) +ds_logger.propagate = False + +import datetime +import logging +import os +import re +import math +import json +import tempfile +import time +import traceback +from copy import deepcopy +from contextlib import contextmanager +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union + +from allennlp.common.util import int_to_device + +import torch +import torch.distributed as dist +from torch.cuda import amp +from torch.nn.parallel import DistributedDataParallel +from torch.nn.utils import clip_grad_norm_ + +import deepspeed + +from allennlp.common import Lazy, Registrable, Tqdm, Params, FromParams +from allennlp.common import util as common_util +from allennlp.common.checks import ConfigurationError, check_for_gpu +from allennlp.data import DataLoader +from allennlp.data.dataloader import TensorDict +from allennlp.models.model import Model +from allennlp.nn import util as nn_util +from allennlp.training import util as training_util +from allennlp.training.checkpointer import Checkpointer +from allennlp.training.metric_tracker import MetricTracker +from allennlp.training.moving_average import MovingAverage +from allennlp.training.optimizers import Optimizer +from allennlp.training.tensorboard_writer import TensorboardWriter +from allennlp.training.trainer import Trainer, BatchCallback, EpochCallback + +logger = logging.getLogger(__name__) + +JsonDict = Dict[str, Any] + +class DeepspeedConfig(FromParams): + def __init__( + self, + optimizer: JsonDict, + fp16: JsonDict = {'enabled': False}, + amp: JsonDict = {'enabled': False}, + zero_optimization: Union[bool, Dict] = False, + zero_allow_untested_optimizer: bool = True, + wall_clock_breakdown: bool = False + ): + self.optimizer = optimizer + self.fp16 = fp16 + self.amp = amp + self.zero_optimization = zero_optimization + self.zero_allow_untested_optimizer = zero_allow_untested_optimizer + self.wall_clock_breakdown = wall_clock_breakdown + + @staticmethod + def build_deepspeed_args(deepspeed_config_path: str, local_rank: int = 0): + from argparse import ArgumentParser, Namespace + parser = ArgumentParser() + parser.add_argument('--local_rank', type=int, default=local_rank) + parser = deepspeed.add_config_arguments(parser) + + args, _ = parser.parse_known_args() + 
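+        # Note: parse_known_args (rather than parse_args) is what makes this safe to
+        # call from inside an already-running AllenNLP process: unrecognized flags on
+        # the real command line are simply ignored, and only --local_rank plus the
+        # DeepSpeed arguments added above are picked up.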
arg_dict = vars(args) + + arg_dict.update(dict(deepspeed_config=deepspeed_config_path, deepspeed=True, local_rank=local_rank)) + return Namespace(**arg_dict) + + @property + def config(self): + # return { + # 'fp16': self.fp16, + # 'amp': self.amp, + # 'zero_optimization': self.zero_optimization, + # 'zero_allow_untested_optimizer': self.zero_allow_untested_optimizer + # } + return vars(self) + + def _to_temp_file(self, serialization_dir, **kwargs): + fd, path = tempfile.mkstemp(dir=serialization_dir) + + config = {**self.config, **kwargs} + with os.fdopen(fd, 'w') as f: + f.write(json.dumps(config)) + + return path + + def launch( + self, + model: torch.nn.Module, + optimizer: Union[str, torch.optim.Optimizer], + local_rank: int, + serialization_dir: str, + batch_size: int, + gradient_accumulation_steps: int, + **kwargs + ): + path = self._to_temp_file(serialization_dir, train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) + ds = deepspeed.initialize( + args=self.build_deepspeed_args(path, local_rank), + model=model, + model_parameters=model.parameters(), + dist_init_required=False, + **kwargs + ) + + os.remove(path) + return ds + +@Trainer.register("deepspeed", constructor="from_partial_objects") +class DeepspeedTrainer(Trainer): + def __init__( + self, + model: Model, + optimizer: torch.optim.Optimizer, + data_loader: DataLoader, + deepspeed_config: DeepspeedConfig, + patience: Optional[int] = None, + validation_metric: str = "-loss", + validation_data_loader: DataLoader = None, + num_epochs: int = 20, + serialization_dir: Optional[str] = None, + checkpointer: Checkpointer = None, + cuda_device: Optional[Union[int, torch.device]] = None, + grad_norm: Optional[float] = None, + grad_clipping: Optional[float] = None, + tensorboard_writer: TensorboardWriter = None, + moving_average: Optional[MovingAverage] = None, + batch_callbacks: List[BatchCallback] = None, + epoch_callbacks: List[EpochCallback] = None, + distributed: bool = False, + local_rank: int = 0, + world_size: int = 1, + num_gradient_accumulation_steps: int = 1, + use_amp: bool = False, + ) -> None: + super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size) + + # I am not calling move_to_gpu here, because if the model is + # not already on the GPU then the optimizer is going to be wrong. 
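+        # Put differently, the caller is expected to have done, in this order
+        # (illustrative sketch only, not enforced here):
+        #     model = model.cuda(cuda_device)                  # move parameters first
+        #     optimizer = SomeOptimizer(model.parameters())    # then build the optimizer
+        # so that any optimizer state ends up on the same device as the parameters;
+        # from_partial_objects below follows exactly that order.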
+ self.model = model + + self.data_loader = data_loader + self._validation_data_loader = validation_data_loader + self.optimizer = optimizer + + if patience is None: # no early stopping + if validation_data_loader is not None: + logger.warning( + "You provided a validation dataset but patience was set to None, " + "meaning that early stopping is disabled" + ) + elif (not isinstance(patience, int)) or patience <= 0: + raise ConfigurationError( + '{} is an invalid value for "patience": it must be a positive integer ' + "or None (if you want to disable early stopping)".format(patience) + ) + + # For tracking is_best_so_far and should_stop_early + self._metric_tracker = MetricTracker(patience, validation_metric) + # Get rid of + or - + self._validation_metric = validation_metric[1:] + + self._num_epochs = num_epochs + + if checkpointer is not None: + self._checkpointer = checkpointer + else: + self._checkpointer = Checkpointer(serialization_dir) + + self._grad_norm = grad_norm + self._grad_clipping = grad_clipping + + self._moving_average = moving_average + self._batch_callbacks = batch_callbacks or [] + self._epoch_callbacks = epoch_callbacks or [] + + # We keep the total batch number as an instance variable because it + # is used inside a closure for the hook which logs activations in + # `_enable_activation_logging`. + self._batch_num_total = 0 + + self._tensorboard = tensorboard_writer or TensorboardWriter(serialization_dir) + self._tensorboard.get_batch_num_total = lambda: self._batch_num_total + self._tensorboard.enable_activation_logging(self.model) + + self._last_log = 0.0 # time of last logging + + self._num_gradient_accumulation_steps = num_gradient_accumulation_steps + + # Enable automatic mixed precision training. + self._scaler: Optional[amp.GradScaler] = None + self._use_amp = use_amp + if self._use_amp: + if self.cuda_device == torch.device("cpu"): + raise ValueError("Using AMP requires a cuda device") + self._scaler = amp.GradScaler() + + self._pytorch_model = self.model + + self._ds_config = deepspeed_config + self.model_engine, self.ds_optimizer, _, _ = self._ds_config.launch( + self.model, + None, # self.optimizer, + local_rank, + serialization_dir, + self.data_loader.batch_size, + num_gradient_accumulation_steps + ) + + def mute_log(*args, **kwargs): + pass + + if hasattr(self.model_engine, 'timers'): + self.model_engine.timers.log = mute_log + + def batch_outputs(self, batch: TensorDict, for_training: bool) -> Dict[str, torch.Tensor]: + """ + Does a forward pass on the given batch and returns the output dictionary that the model + returns, after adding any specified regularization penalty to the loss (if training). + """ + # batch = nn_util.move_to_device(batch, self.cuda_device) + batch = nn_util.move_to_device(batch, self.model_engine.device) + output_dict = self.model_engine(**batch) + + if for_training: + try: + assert "loss" in output_dict + regularization_penalty = self.model.get_regularization_penalty() + + if regularization_penalty is not None: + output_dict["reg_loss"] = regularization_penalty + output_dict["loss"] += regularization_penalty + + except AssertionError: + if for_training: + raise RuntimeError( + "The model you are trying to optimize does not contain a" + " 'loss' key in the output of model.forward(inputs)." + ) + + return output_dict + + def _train_epoch(self, epoch: int) -> Dict[str, float]: + """ + Trains one epoch and returns metrics. 
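+        The returned dict holds the model's metrics for the epoch (including "loss")
+        together with memory-usage entries such as "worker_0_memory_MB" and
+        "gpu_0_memory_MB", which are appended at the end of this method.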
+ """ + logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) + cpu_memory_usage = [] + for worker, memory in common_util.peak_memory_mb().items(): + cpu_memory_usage.append((worker, memory)) + logger.info(f"Worker {worker} memory usage MB: {memory}") + gpu_memory_usage = [] + for gpu, memory in common_util.gpu_memory_mb().items(): + gpu_memory_usage.append((gpu, memory)) + logger.info(f"GPU {gpu} memory usage MB: {memory}") + + regularization_penalty = self.model.get_regularization_penalty() + + train_loss = 0.0 + batch_loss = 0.0 + + if regularization_penalty is not None: + train_reg_loss = 0.0 + batch_reg_loss = 0.0 + else: + train_reg_loss = None + batch_reg_loss = None + # Set the model to "train" mode. + self.model_engine.train() + + # Get tqdm for the training batches + batch_generator = iter(self.data_loader) + batch_group_generator = common_util.lazy_groups_of( + batch_generator, self._num_gradient_accumulation_steps + ) + + logger.info("Training") + + num_training_batches: Union[int, float] + try: + len_data_loader = len(self.data_loader) + num_training_batches = math.ceil( + len_data_loader / self._num_gradient_accumulation_steps + ) + except TypeError: + num_training_batches = float("inf") + + # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's + # progress is shown + batch_group_generator_tqdm = batch_group_generator + if self._master: + batch_group_generator_tqdm = Tqdm.tqdm( + batch_group_generator, total=num_training_batches + ) + + self._last_log = time.time() + + batches_this_epoch = 0 + if self._batch_num_total is None: + self._batch_num_total = 0 + + done_early = False + for batch_group in batch_group_generator_tqdm: + batches_this_epoch += 1 + self._batch_num_total += 1 + batch_num_total = self._batch_num_total + + self.optimizer.zero_grad() + + batch_group_outputs = [] + for batch in batch_group: + with amp.autocast(self._use_amp): + batch_outputs = self.batch_outputs(batch, for_training=True) + batch_group_outputs.append(batch_outputs) + loss = batch_outputs.get("loss") + reg_loss = batch_outputs.get("reg_loss") + if torch.isnan(loss): + raise ValueError("nan loss encountered") + loss = loss / len(batch_group) + + batch_loss = loss.item() + train_loss += batch_loss + if reg_loss is not None: + reg_loss = reg_loss / len(batch_group) + batch_reg_loss = reg_loss.item() + train_reg_loss += batch_reg_loss + + + self.model_engine.backward(loss) + self.model_engine.step() + + param_updates = None + if self._tensorboard.should_log_histograms_this_batch() and self._master: + # Get the magnitude of parameter updates for logging. We need to do some + # computation before and after the optimizer step, and it's expensive because of + # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so + # we don't do this every batch, only when it's requested. 
+ param_updates = { + name: param.detach().cpu().clone() + for name, param in self.model.named_parameters() + } + + if self._scaler is not None: + self._scaler.step(self.optimizer) + self._scaler.update() + else: + self.optimizer.step() + + for name, param in self.model.named_parameters(): + param_updates[name].sub_(param.detach().cpu()) + else: + if self._scaler is not None: + self._scaler.step(self.optimizer) + self._scaler.update() + else: + self.optimizer.step() + + # Update moving averages + if self._moving_average is not None: + self._moving_average.apply(batch_num_total) + + # Update the description with the latest metrics + metrics = training_util.get_metrics( + self.model, + train_loss, + train_reg_loss, + batch_loss, + batch_reg_loss, + batches_this_epoch, + world_size=self._world_size, + cuda_device=self.cuda_device, + ) + + if self._master: + # Updating tqdm only for the master as the trainers wouldn't have one + description = training_util.description_from_metrics(metrics) + batch_group_generator_tqdm.set_description(description, refresh=False) + self._tensorboard.log_batch( + self.model, + self.optimizer, + 0., # batch_grad_norm, + metrics, + batch_group, + param_updates, + ) + + self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) + + for callback in self._batch_callbacks: + callback( + self, + batch_group, + batch_group_outputs, + epoch, + batches_this_epoch, + is_training=True, + is_master=self._master, + ) + + metrics = training_util.get_metrics( + self.model, + train_loss, + train_reg_loss, + batch_loss=None, + batch_reg_loss=None, + num_batches=batches_this_epoch, + reset=True, + world_size=self._world_size, + cuda_device=self.cuda_device, + ) + + for (worker, memory) in cpu_memory_usage: + metrics["worker_" + str(worker) + "_memory_MB"] = memory + for (gpu_num, memory) in gpu_memory_usage: + metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory + return metrics + + def _validation_loss(self, epoch: int) -> Tuple[float, float, int]: + """ + Computes the validation loss. Returns it and the number of batches. + """ + logger.info("Validating") + + self.model_engine.eval() + + # Replace parameter values with the shadow values from the moving averages. + if self._moving_average is not None: + self._moving_average.assign_average_value() + + if self._validation_data_loader is not None: + validation_data_loader = self._validation_data_loader + else: + raise ConfigurationError( + "Validation results cannot be calculated without a validation_data_loader" + ) + + regularization_penalty = self.model.get_regularization_penalty() + + # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's + # progress is shown + if self._master: + val_generator_tqdm = Tqdm.tqdm(validation_data_loader) + else: + val_generator_tqdm = validation_data_loader + + batches_this_epoch = 0 + val_loss = 0 + val_batch_loss = 0 + if regularization_penalty is not None: + val_reg_loss = 0 + val_batch_reg_loss = 0 + else: + val_reg_loss = None + val_batch_reg_loss = None + done_early = False + for batch in val_generator_tqdm: + if self._distributed: + # Check whether the other workers have stopped already (due to differing amounts of + # data in each). If so, we can't proceed because we would hang when we hit the + # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor + # here because NCCL process groups apparently don't support BoolTensor. 
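+                # Both halves of the hand-shake appear in this method: a worker that
+                # still has validation data contributes 0 to the all-reduce, a worker
+                # that has finished its data contributes 1 (see the block after this
+                # loop), so a nonzero sum means someone is done and we stop early too.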
+ done = torch.tensor(0, device=self.cuda_device) + torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) + if done.item() > 0: + done_early = True + logger.warning( + f"Worker {torch.distributed.get_rank()} finishing validation early! " + "This implies that there is an imbalance in your validation " + "data across the workers and that some amount of it will be " + "ignored. A small amount of this is fine, but a major imbalance " + "should be avoided. Note: This warning will appear unless your " + "data is perfectly balanced." + ) + break + + with amp.autocast(self._use_amp): + batch_outputs = self.batch_outputs(batch, for_training=False) + loss = batch_outputs.get("loss") + reg_loss = batch_outputs.get("reg_loss") + if loss is not None: + # You shouldn't necessarily have to compute a loss for validation, so we allow for + # `loss` to be None. We need to be careful, though - `batches_this_epoch` is + # currently only used as the divisor for the loss function, so we can safely only + # count those batches for which we actually have a loss. If this variable ever + # gets used for something else, we might need to change things around a bit. + batches_this_epoch += 1 + val_batch_loss = loss.detach().cpu().numpy() + val_loss += val_batch_loss + if reg_loss is not None: + val_batch_reg_loss = reg_loss.detach().cpu().numpy() + val_reg_loss += val_batch_reg_loss + + # Update the description with the latest metrics + val_metrics = training_util.get_metrics( + self.model, + val_loss, + val_reg_loss, + val_batch_loss, + val_batch_reg_loss, + batches_this_epoch, + world_size=self._world_size, + cuda_device=self.cuda_device, + ) + + description = training_util.description_from_metrics(val_metrics) + if self._master: + val_generator_tqdm.set_description(description, refresh=False) + + for callback in self._batch_callbacks: + callback( + self, + [batch], + [batch_outputs], + epoch, + batches_this_epoch, + is_training=False, + is_master=self._master, + ) + + if self._distributed and not done_early: + logger.warning( + f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)." + ) + # Indicate that we're done so that any workers that have remaining data stop validation early. + done = torch.tensor(1, device=self.cuda_device) + torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) + assert done.item() + + # Now restore the original parameter values. + if self._moving_average is not None: + self._moving_average.restore() + + return val_loss, val_reg_loss, batches_this_epoch + + def train(self) -> Dict[str, Any]: + """ + Trains the supplied model with the supplied parameters. + """ + try: + epoch_counter = self._restore_checkpoint() + except RuntimeError: + traceback.print_exc() + raise ConfigurationError( + "Could not recover training from the checkpoint. Did you mean to output to " + "a different serialization directory or delete the existing serialization " + "directory?" 
+ ) + + training_util.enable_gradient_clipping(self.model, self._grad_clipping) + + logger.info("Beginning training.") + + val_metrics: Dict[str, float] = {} + this_epoch_val_metric: float = None + metrics: Dict[str, Any] = {} + epochs_trained = 0 + training_start_time = time.time() + + metrics["best_epoch"] = self._metric_tracker.best_epoch + for key, value in self._metric_tracker.best_epoch_metrics.items(): + metrics["best_validation_" + key] = value + + for callback in self._epoch_callbacks: + callback(self, metrics={}, epoch=-1, is_master=self._master) + + for epoch in range(epoch_counter, self._num_epochs): + epoch_start_time = time.time() + train_metrics = self._train_epoch(epoch) + + # get peak of memory usage + for key, value in train_metrics.items(): + if key.startswith("gpu_") and key.endswith("_memory_MB"): + metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) + elif key.startswith("worker_") and key.endswith("_memory_MB"): + metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) + + if self._validation_data_loader is not None: + with torch.no_grad(): + # We have a validation set, so compute all the metrics on it. + val_loss, val_reg_loss, num_batches = self._validation_loss(epoch) + + # It is safe again to wait till the validation is done. This is + # important to get the metrics right. + if self._distributed: + dist.barrier() + + val_metrics = training_util.get_metrics( + self.model, + val_loss, + val_reg_loss, + batch_loss=None, + batch_reg_loss=None, + num_batches=num_batches, + reset=True, + world_size=self._world_size, + cuda_device=self.cuda_device, + ) + + # Check validation metric for early stopping + this_epoch_val_metric = val_metrics[self._validation_metric] + self._metric_tracker.add_metric(this_epoch_val_metric) + + if self._metric_tracker.should_stop_early(): + logger.info("Ran out of patience. Stopping training.") + break + + if self._master: + self._tensorboard.log_metrics( + train_metrics, val_metrics=val_metrics, log_to_console=True, epoch=epoch + 1 + ) # +1 because tensorboard doesn't like 0 + + # Create overall metrics dict + training_elapsed_time = time.time() - training_start_time + metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time)) + metrics["training_start_epoch"] = epoch_counter + metrics["training_epochs"] = epochs_trained + metrics["epoch"] = epoch + + for key, value in train_metrics.items(): + metrics["training_" + key] = value + for key, value in val_metrics.items(): + metrics["validation_" + key] = value + + if self._metric_tracker.is_best_so_far(): + # Update all the best_ metrics. + # (Otherwise they just stay the same as they were.) 
+ metrics["best_epoch"] = epoch + for key, value in val_metrics.items(): + metrics["best_validation_" + key] = value + + self._metric_tracker.best_epoch_metrics = val_metrics + + if self._serialization_dir and self._master: + common_util.dump_metrics( + os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), metrics + ) + + + if self._master: + self._checkpointer.save_checkpoint( + epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() + ) + + # Wait for the master to finish saving the checkpoint + if self._distributed: + dist.barrier() + + for callback in self._epoch_callbacks: + callback(self, metrics=metrics, epoch=epoch, is_master=self._master) + + epoch_elapsed_time = time.time() - epoch_start_time + logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time)) + + if epoch < self._num_epochs - 1: + training_elapsed_time = time.time() - training_start_time + estimated_time_remaining = training_elapsed_time * ( + (self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1 + ) + formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining))) + logger.info("Estimated training time remaining: %s", formatted_time) + + epochs_trained += 1 + + # make sure pending events are flushed to disk and files are closed properly + self._tensorboard.close() + + # Load the best model state before returning + best_model_state = self._checkpointer.best_model_state() + if best_model_state: + self.model.load_state_dict(best_model_state) + + return metrics + + @contextmanager + def get_checkpoint_state(self) -> Iterator[Tuple[Dict[str, Any], Dict[str, Any]]]: + if self._moving_average is not None: + # Assigning average value to model parameters. The checkpointer will call + # `restore_state_after_checkpointing` when it is done to put this back to what it was. + self._moving_average.assign_average_value() + + model_state = self.model.state_dict() + + # These are the training states we need to persist. + training_states = { + "metric_tracker": self._metric_tracker.state_dict(), + "optimizer": self.optimizer.state_dict(), + "batch_num_total": self._batch_num_total, + } + + try: + yield model_state, training_states + finally: + if self._moving_average is not None: + self._moving_average.restore() + + def _restore_checkpoint(self) -> int: + """ + Restores the model and training state from the last saved checkpoint. + This includes an epoch count and optimizer state, which is serialized separately + from model parameters. This function should only be used to continue training - + if you wish to load a model for inference/load parts of a model into a new + computation graph, you should use the native Pytorch functions: + ` model.load_state_dict(torch.load("/path/to/model/weights.th"))` + If `self._serialization_dir` does not exist or does not contain any checkpointed weights, + this function will do nothing and return 0. + # Returns + epoch: `int` + The epoch at which to resume training, which should be one after the epoch + in the saved training state. + """ + model_state, training_state = self._checkpointer.restore_checkpoint() + + if not training_state: + # No checkpoint to restore, start at 0 + return 0 + + self.model.load_state_dict(model_state) + self.optimizer.load_state_dict(training_state["optimizer"]) + training_util.move_optimizer_to_cuda(self.optimizer) + + # Currently the `training_state` contains a serialized `MetricTracker`. 
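+        # training_state is the dict yielded by get_checkpoint_state above, plus an
+        # "epoch" entry, i.e. roughly:
+        #     {"metric_tracker": ..., "optimizer": ..., "batch_num_total": ..., "epoch": ...}
+        # Older checkpoints may instead carry "val_metric_per_epoch", or nothing at all.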
+ if "metric_tracker" in training_state: + self._metric_tracker.load_state_dict(training_state["metric_tracker"]) + # It used to be the case that we tracked `val_metric_per_epoch`. + elif "val_metric_per_epoch" in training_state: + self._metric_tracker.clear() + self._metric_tracker.add_metrics(training_state["val_metric_per_epoch"]) + # And before that we didn't track anything. + else: + self._metric_tracker.clear() + + if isinstance(training_state["epoch"], int): + epoch_to_return = training_state["epoch"] + 1 + else: + epoch_to_return = int(training_state["epoch"].split(".")[0]) + 1 + + # For older checkpoints with batch_num_total missing, default to old behavior where + # it is unchanged. + batch_num_total = training_state.get("batch_num_total") + if batch_num_total is not None: + self._batch_num_total = batch_num_total + + return epoch_to_return + + @classmethod + def from_partial_objects( + cls, + model: Model, + serialization_dir: str, + data_loader: DataLoader, + validation_data_loader: DataLoader = None, + local_rank: int = 0, + patience: int = None, + validation_metric: str = "-loss", + num_epochs: int = 20, + cuda_device: Optional[Union[int, torch.device]] = None, + grad_norm: float = None, + grad_clipping: float = None, + distributed: bool = None, + world_size: int = 1, + num_gradient_accumulation_steps: int = 1, + use_amp: bool = False, + no_grad: List[str] = None, + optimizer: Lazy[Optimizer] = None, + tensorboard_writer: Lazy[TensorboardWriter] = None, + moving_average: Lazy[MovingAverage] = None, + checkpointer: Lazy[Checkpointer] = None, + batch_callbacks: List[BatchCallback] = None, + epoch_callbacks: List[EpochCallback] = None, + deepspeed_config: DeepspeedConfig = None + ) -> "Trainer": + """ + This method exists so that we can have a documented method to construct this class using + `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this + method. + The reason we can't just use `__init__` with `FromParams` here is because there are + sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type + annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to + have the parameters from the `Model` before it's constructed, and the `Schedulers` need to + have the `Optimizer`. Because of this, the typical way we construct things `FromParams` + doesn't work, so we use `Lazy` to allow for constructing the objects sequentially. + If you're not using `FromParams`, you can just construct these arguments in the right order + yourself in your code and call the constructor directly. + """ + if cuda_device is None: + from torch import cuda + + if cuda.device_count() > 0: + cuda_device = 0 + else: + cuda_device = -1 + + check_for_gpu(cuda_device) + if cuda_device >= 0: + # Moving model to GPU here so that the optimizer state gets constructed on + # the right device. 
+ model = model.cuda(cuda_device) + + if no_grad: + for name, parameter in model.named_parameters(): + if any(re.search(regex, name) for regex in no_grad): + parameter.requires_grad_(False) + + parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] + optimizer_ = optimizer.construct(model_parameters=parameters) + if not optimizer_: + optimizer_ = Optimizer.default(parameters) + + common_util.log_frozen_and_tunable_parameter_names(model) + + batches_per_epoch: Optional[int] + try: + batches_per_epoch = len(data_loader) + batches_per_epoch = math.ceil(batches_per_epoch / num_gradient_accumulation_steps) + except TypeError: + batches_per_epoch = None + + moving_average_ = moving_average.construct(parameters=parameters) + + checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir) + tensorboard_writer_ = tensorboard_writer.construct() or TensorboardWriter(serialization_dir) + + return cls( + model, + optimizer_, + data_loader, + patience=patience, + validation_metric=validation_metric, + validation_data_loader=validation_data_loader, + num_epochs=num_epochs, + serialization_dir=serialization_dir, + cuda_device=cuda_device, + grad_norm=grad_norm, + grad_clipping=grad_clipping, + tensorboard_writer=tensorboard_writer_, + checkpointer=checkpointer_, + moving_average=moving_average_, + batch_callbacks=batch_callbacks, + epoch_callbacks=epoch_callbacks, + distributed=distributed, + local_rank=local_rank, + world_size=world_size, + num_gradient_accumulation_steps=num_gradient_accumulation_steps, + use_amp=use_amp, + deepspeed_config=deepspeed_config + ) \ No newline at end of file diff --git a/allennlp/training/metrics/categorical_accuracy.py b/allennlp/training/metrics/categorical_accuracy.py index 1366b1476ee..1474693b1ae 100644 --- a/allennlp/training/metrics/categorical_accuracy.py +++ b/allennlp/training/metrics/categorical_accuracy.py @@ -95,7 +95,7 @@ def __call__( correct *= mask.view(-1, 1) _total_count = mask.sum() else: - _total_count = torch.tensor(gold_labels.numel()) + _total_count = torch.tensor(gold_labels.numel(), device=gold_labels.device) _correct_count = correct.sum() if is_distributed(): From 619657e924e013f4b957d2729c05a8f9f81b3dae Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Fri, 2 Oct 2020 11:31:21 -0400 Subject: [PATCH 02/20] delegating grad_clipping, grad_norm, grad_acculumation, etc. 
to deepspeed --- allennlp/training/deepspeed_trainer.py | 154 ++++++++----------------- 1 file changed, 48 insertions(+), 106 deletions(-) diff --git a/allennlp/training/deepspeed_trainer.py b/allennlp/training/deepspeed_trainer.py index 6ef41b27f6b..40f8848da6a 100644 --- a/allennlp/training/deepspeed_trainer.py +++ b/allennlp/training/deepspeed_trainer.py @@ -22,7 +22,6 @@ import torch.distributed as dist from torch.cuda import amp from torch.nn.parallel import DistributedDataParallel -from torch.nn.utils import clip_grad_norm_ import deepspeed @@ -121,7 +120,6 @@ class DeepspeedTrainer(Trainer): def __init__( self, model: Model, - optimizer: torch.optim.Optimizer, data_loader: DataLoader, deepspeed_config: DeepspeedConfig, patience: Optional[int] = None, @@ -131,8 +129,6 @@ def __init__( serialization_dir: Optional[str] = None, checkpointer: Checkpointer = None, cuda_device: Optional[Union[int, torch.device]] = None, - grad_norm: Optional[float] = None, - grad_clipping: Optional[float] = None, tensorboard_writer: TensorboardWriter = None, moving_average: Optional[MovingAverage] = None, batch_callbacks: List[BatchCallback] = None, @@ -151,7 +147,6 @@ def __init__( self.data_loader = data_loader self._validation_data_loader = validation_data_loader - self.optimizer = optimizer if patience is None: # no early stopping if validation_data_loader is not None: @@ -177,9 +172,6 @@ def __init__( else: self._checkpointer = Checkpointer(serialization_dir) - self._grad_norm = grad_norm - self._grad_clipping = grad_clipping - self._moving_average = moving_average self._batch_callbacks = batch_callbacks or [] self._epoch_callbacks = epoch_callbacks or [] @@ -197,30 +189,21 @@ def __init__( self._num_gradient_accumulation_steps = num_gradient_accumulation_steps - # Enable automatic mixed precision training. - self._scaler: Optional[amp.GradScaler] = None - self._use_amp = use_amp - if self._use_amp: - if self.cuda_device == torch.device("cpu"): - raise ValueError("Using AMP requires a cuda device") - self._scaler = amp.GradScaler() - self._pytorch_model = self.model self._ds_config = deepspeed_config - self.model_engine, self.ds_optimizer, _, _ = self._ds_config.launch( + self.model_engine, self.optimizer, _, _ = self._ds_config.launch( self.model, - None, # self.optimizer, + None, local_rank, serialization_dir, self.data_loader.batch_size, num_gradient_accumulation_steps ) - def mute_log(*args, **kwargs): - pass - if hasattr(self.model_engine, 'timers'): + def mute_log(*args, **kwargs): + pass self.model_engine.timers.log = mute_log def batch_outputs(self, batch: TensorDict, for_training: bool) -> Dict[str, torch.Tensor]: @@ -280,27 +263,19 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: # Get tqdm for the training batches batch_generator = iter(self.data_loader) - batch_group_generator = common_util.lazy_groups_of( - batch_generator, self._num_gradient_accumulation_steps - ) logger.info("Training") num_training_batches: Union[int, float] - try: - len_data_loader = len(self.data_loader) - num_training_batches = math.ceil( - len_data_loader / self._num_gradient_accumulation_steps - ) - except TypeError: - num_training_batches = float("inf") - + len_data_loader = len(self.data_loader) + num_training_batches = len_data_loader + # Having multiple tqdm bars in case of distributed training will be a mess. 
Hence only the master's # progress is shown - batch_group_generator_tqdm = batch_group_generator + batch_generator_tqdm = batch_generator if self._master: - batch_group_generator_tqdm = Tqdm.tqdm( - batch_group_generator, total=num_training_batches + batch_generator_tqdm = Tqdm.tqdm( + batch_generator, total=num_training_batches ) self._last_log = time.time() @@ -310,34 +285,26 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: self._batch_num_total = 0 done_early = False - for batch_group in batch_group_generator_tqdm: + for batch in batch_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total - self.optimizer.zero_grad() - - batch_group_outputs = [] - for batch in batch_group: - with amp.autocast(self._use_amp): - batch_outputs = self.batch_outputs(batch, for_training=True) - batch_group_outputs.append(batch_outputs) - loss = batch_outputs.get("loss") - reg_loss = batch_outputs.get("reg_loss") - if torch.isnan(loss): - raise ValueError("nan loss encountered") - loss = loss / len(batch_group) - - batch_loss = loss.item() - train_loss += batch_loss - if reg_loss is not None: - reg_loss = reg_loss / len(batch_group) - batch_reg_loss = reg_loss.item() - train_reg_loss += batch_reg_loss - - - self.model_engine.backward(loss) - self.model_engine.step() + batch_outputs = self.batch_outputs(batch, for_training=True) + loss = batch_outputs.get("loss") + reg_loss = batch_outputs.get("reg_loss") + if torch.isnan(loss): + raise ValueError("nan loss encountered") + + batch_loss = loss.item() + train_loss += batch_loss + if reg_loss is not None: + batch_reg_loss = reg_loss.item() + train_reg_loss += batch_reg_loss + + + self.model_engine.backward(loss) + self.model_engine.step() param_updates = None if self._tensorboard.should_log_histograms_this_batch() and self._master: @@ -350,20 +317,8 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: for name, param in self.model.named_parameters() } - if self._scaler is not None: - self._scaler.step(self.optimizer) - self._scaler.update() - else: - self.optimizer.step() - for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) - else: - if self._scaler is not None: - self._scaler.step(self.optimizer) - self._scaler.update() - else: - self.optimizer.step() # Update moving averages if self._moving_average is not None: @@ -384,13 +339,13 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: if self._master: # Updating tqdm only for the master as the trainers wouldn't have one description = training_util.description_from_metrics(metrics) - batch_group_generator_tqdm.set_description(description, refresh=False) + batch_generator_tqdm.set_description(description, refresh=False) self._tensorboard.log_batch( self.model, self.optimizer, 0., # batch_grad_norm, metrics, - batch_group, + batch, param_updates, ) @@ -399,8 +354,8 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: for callback in self._batch_callbacks: callback( self, - batch_group, - batch_group_outputs, + batch, + batch_outputs, epoch, batches_this_epoch, is_training=True, @@ -483,22 +438,21 @@ def _validation_loss(self, epoch: int) -> Tuple[float, float, int]: ) break - with amp.autocast(self._use_amp): - batch_outputs = self.batch_outputs(batch, for_training=False) - loss = batch_outputs.get("loss") - reg_loss = batch_outputs.get("reg_loss") - if loss is not None: - # You shouldn't necessarily have to compute a loss for validation, so we allow for - # `loss` to be None. 
We need to be careful, though - `batches_this_epoch` is - # currently only used as the divisor for the loss function, so we can safely only - # count those batches for which we actually have a loss. If this variable ever - # gets used for something else, we might need to change things around a bit. - batches_this_epoch += 1 - val_batch_loss = loss.detach().cpu().numpy() - val_loss += val_batch_loss - if reg_loss is not None: - val_batch_reg_loss = reg_loss.detach().cpu().numpy() - val_reg_loss += val_batch_reg_loss + batch_outputs = self.batch_outputs(batch, for_training=False) + loss = batch_outputs.get("loss") + reg_loss = batch_outputs.get("reg_loss") + if loss is not None: + # You shouldn't necessarily have to compute a loss for validation, so we allow for + # `loss` to be None. We need to be careful, though - `batches_this_epoch` is + # currently only used as the divisor for the loss function, so we can safely only + # count those batches for which we actually have a loss. If this variable ever + # gets used for something else, we might need to change things around a bit. + batches_this_epoch += 1 + val_batch_loss = loss.detach().cpu().numpy() + val_loss += val_batch_loss + if reg_loss is not None: + val_batch_reg_loss = reg_loss.detach().cpu().numpy() + val_reg_loss += val_batch_reg_loss # Update the description with the latest metrics val_metrics = training_util.get_metrics( @@ -556,8 +510,6 @@ def train(self) -> Dict[str, Any]: "directory?" ) - training_util.enable_gradient_clipping(self.model, self._grad_clipping) - logger.info("Beginning training.") val_metrics: Dict[str, float] = {} @@ -725,6 +677,7 @@ def _restore_checkpoint(self) -> int: return 0 self.model.load_state_dict(model_state) + # self.model_engine.load_checkpoint() self.optimizer.load_state_dict(training_state["optimizer"]) training_util.move_optimizer_to_cuda(self.optimizer) @@ -764,12 +717,9 @@ def from_partial_objects( validation_metric: str = "-loss", num_epochs: int = 20, cuda_device: Optional[Union[int, torch.device]] = None, - grad_norm: float = None, - grad_clipping: float = None, distributed: bool = None, world_size: int = 1, num_gradient_accumulation_steps: int = 1, - use_amp: bool = False, no_grad: List[str] = None, optimizer: Lazy[Optimizer] = None, tensorboard_writer: Lazy[TensorboardWriter] = None, @@ -811,11 +761,6 @@ def from_partial_objects( if any(re.search(regex, name) for regex in no_grad): parameter.requires_grad_(False) - parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] - optimizer_ = optimizer.construct(model_parameters=parameters) - if not optimizer_: - optimizer_ = Optimizer.default(parameters) - common_util.log_frozen_and_tunable_parameter_names(model) batches_per_epoch: Optional[int] @@ -825,6 +770,7 @@ def from_partial_objects( except TypeError: batches_per_epoch = None + parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] moving_average_ = moving_average.construct(parameters=parameters) checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir) @@ -832,16 +778,14 @@ def from_partial_objects( return cls( model, - optimizer_, data_loader, + deepspeed_config=deepspeed_config, patience=patience, validation_metric=validation_metric, validation_data_loader=validation_data_loader, num_epochs=num_epochs, serialization_dir=serialization_dir, cuda_device=cuda_device, - grad_norm=grad_norm, - grad_clipping=grad_clipping, tensorboard_writer=tensorboard_writer_, checkpointer=checkpointer_, moving_average=moving_average_, @@ -851,6 
+795,4 @@ def from_partial_objects( local_rank=local_rank, world_size=world_size, num_gradient_accumulation_steps=num_gradient_accumulation_steps, - use_amp=use_amp, - deepspeed_config=deepspeed_config ) \ No newline at end of file From a329fd26fe66d522fae7c594761410caa998b31d Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Fri, 2 Oct 2020 12:07:48 -0400 Subject: [PATCH 03/20] cleaning up deepspeed config interface --- allennlp/training/deepspeed_trainer.py | 31 ++++++++------------------ 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/allennlp/training/deepspeed_trainer.py b/allennlp/training/deepspeed_trainer.py index 40f8848da6a..ecd9998719e 100644 --- a/allennlp/training/deepspeed_trainer.py +++ b/allennlp/training/deepspeed_trainer.py @@ -62,17 +62,11 @@ def __init__( self.wall_clock_breakdown = wall_clock_breakdown @staticmethod - def build_deepspeed_args(deepspeed_config_path: str, local_rank: int = 0): - from argparse import ArgumentParser, Namespace - parser = ArgumentParser() - parser.add_argument('--local_rank', type=int, default=local_rank) - parser = deepspeed.add_config_arguments(parser) + def build_deepspeed_args(local_rank: int = 0): + from argparse import Namespace - args, _ = parser.parse_known_args() - arg_dict = vars(args) - - arg_dict.update(dict(deepspeed_config=deepspeed_config_path, deepspeed=True, local_rank=local_rank)) - return Namespace(**arg_dict) + args = dict(deepspeed_config=deepspeed_config_path, deepspeed=True, local_rank=local_rank) + return Namespace(**args) @property def config(self): @@ -84,15 +78,6 @@ def config(self): # } return vars(self) - def _to_temp_file(self, serialization_dir, **kwargs): - fd, path = tempfile.mkstemp(dir=serialization_dir) - - config = {**self.config, **kwargs} - with os.fdopen(fd, 'w') as f: - f.write(json.dumps(config)) - - return path - def launch( self, model: torch.nn.Module, @@ -103,16 +88,18 @@ def launch( gradient_accumulation_steps: int, **kwargs ): - path = self._to_temp_file(serialization_dir, train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) + path = '' + config = dict(**self.config, train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) ds = deepspeed.initialize( args=self.build_deepspeed_args(path, local_rank), model=model, model_parameters=model.parameters(), dist_init_required=False, + config_params=config, **kwargs ) - os.remove(path) + # os.remove(path) return ds @Trainer.register("deepspeed", constructor="from_partial_objects") @@ -711,6 +698,7 @@ def from_partial_objects( model: Model, serialization_dir: str, data_loader: DataLoader, + deepspeed_config: DeepspeedConfig, validation_data_loader: DataLoader = None, local_rank: int = 0, patience: int = None, @@ -727,7 +715,6 @@ def from_partial_objects( checkpointer: Lazy[Checkpointer] = None, batch_callbacks: List[BatchCallback] = None, epoch_callbacks: List[EpochCallback] = None, - deepspeed_config: DeepspeedConfig = None ) -> "Trainer": """ This method exists so that we can have a documented method to construct this class using From 00666c2e351249e6915c5dcd57e99edac817eece Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Wed, 7 Oct 2020 12:33:47 -0400 Subject: [PATCH 04/20] idenifying bottleneck / start simplifying model engine --- allennlp/training/deepspeed_engine_adapter.py | 1130 +++++++++++++++++ allennlp/training/deepspeed_trainer.py | 62 +- 2 files changed, 1158 insertions(+), 34 deletions(-) create mode 100644 allennlp/training/deepspeed_engine_adapter.py diff 
--git a/allennlp/training/deepspeed_engine_adapter.py b/allennlp/training/deepspeed_engine_adapter.py new file mode 100644 index 00000000000..e28b39d3240 --- /dev/null +++ b/allennlp/training/deepspeed_engine_adapter.py @@ -0,0 +1,1130 @@ +''' +Copyright 2019 The Microsoft DeepSpeed Team +''' + +import os +import torch +import warnings +import torch.distributed as dist + +import apex +from apex import amp +from torch.nn.modules import Module +from torch.distributed.distributed_c10d import _get_global_rank + +from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer +from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 +from deepspeed.runtime.zero.utils import is_zero_supported_optimizer +from deepspeed.runtime.activation_checkpointing import checkpointing as activation_checkpointing +from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer +from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer +from deepspeed.runtime.config import DeepSpeedConfig, \ + ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, DEEPSPEED_ADAM, DEEPSPEED_OPTIMIZERS +from deepspeed.runtime.dataloader import DeepSpeedDataLoader +from deepspeed.runtime.constants import \ + ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ + TORCH_DISTRIBUTED_DEFAULT_PORT +from deepspeed.runtime.zero.constants import \ + ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS +from deepspeed.runtime.csr_tensor import CSRTensor +import deepspeed.runtime.lr_schedules as lr_schedules +from deepspeed.utils import logger, log_dist + +MEMORY_OPT_ALLREDUCE_SIZE = 500000000 +SUMMARY_WRITER_DIR_NAME = "JobId" + +try: + from apex_C import flatten + from apex_C import unflatten +except ImportError: + try: + _ = warned_flatten + except NameError: + logger.warning( + "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten." 
+ ) + warned_flatten = True + from torch._utils import _flatten_dense_tensors as flatten + from torch._utils import _unflatten_dense_tensors as unflatten + + +class DummyTimer: + class Timer: + def __init__(self, name): + pass + + def start(self): + pass + + def stop(self): + pass + + def reset(self): + pass + + def __init__(self): + self.timers = {} + + def __call__(self, name): + if name not in self.timers: + self.timers[name] = self.Timer(name) + return self.timers[name] + + def log(self, *args, **kwargs): + pass + + + +def split_half_float_double_csr(tensors): + dtypes = [ + "torch.cuda.HalfTensor", + "torch.cuda.FloatTensor", + "torch.cuda.DoubleTensor", + CSRTensor.type() + ] + buckets = [] + for i, dtype in enumerate(dtypes): + bucket = [t for t in tensors if t.type() == dtype] + if bucket: + buckets.append((dtype, bucket)) + return buckets + + +def _initialize_parameter_parallel_groups(parameter_parallel_size=None): + data_parallel_size = int(dist.get_world_size()) + if parameter_parallel_size is None: + parameter_parallel_size = int(data_parallel_size) + logger.info("data_parallel_size: %s, parameter_parallel_size: %s", + data_parallel_size, + parameter_parallel_size) + assert data_parallel_size % parameter_parallel_size == 0, \ + 'world size should be divisible by parameter parallel size' + rank = dist.get_rank() + my_group = None + for i in range(dist.get_world_size() // parameter_parallel_size): + ranks = range(i * parameter_parallel_size, (i + 1) * parameter_parallel_size) + group = torch.distributed.new_group(ranks) + if rank in ranks: + my_group = group + return my_group + + +class AllennlpDeepSpeedEngineAdapter(Module): + r"""DeepSpeed engine for training. + """ + def __init__(self, + args, + model, + optimizer=None, + model_parameters=None, + training_data=None, + lr_scheduler=None, + mpu=None, + dist_init_required=None, + collate_fn=None, + config_params=None): + super().__init__() + self.client_optimizer = optimizer + self.client_model_parameters = model_parameters + self.client_lr_scheduler = lr_scheduler + self.training_data = training_data + self.collate_fn = collate_fn + self.mpu = mpu + self.data_parallel_group = None + self.global_steps = 0 + self.global_samples = 0 + self.micro_steps = 0 + self.skipped_steps = 0 + self.gradient_average = True + self.warn_unscaled_loss = True + self.config_params = config_params + self.loaded_checkpoint_mp_world_size = None + self.loaded_checkpoint_dp_world_size = None + self.enable_backward_allreduce = True + + if dist_init_required is None: + dist_init_required = not dist.is_initialized() + + self.dist_backend = "nccl" + if dist_init_required: + if not dist.is_initialized(): + logger.info("Initializing torch distributed with backend: {}".format( + self.dist_backend)) + dist.init_process_group(backend=self.dist_backend) + else: + logger.warning( + "Was given dist_init_required=True but detected that torch" + "distributed was already initialized, cannot initialize twice.") + + self._configure_with_arguments(args, mpu) + + self._init_distributed(dist_init_required) + + # Configure distributed model + self._configure_distributed_model(model) + + if training_data: + self.training_dataloader = self.deepspeed_io(training_data) + else: + self.training_dataloader = None + + # Configure optimizer and scheduler + self.optimizer = None + self.lr_scheduler = None + if model_parameters or optimizer: + self._configure_optimizer(optimizer, model_parameters) + self._configure_lr_scheduler(lr_scheduler) + self._report_progress(0) + + # Bookkeeping 
for csr support + self.csr_tensor_module_names = set() + if self.sparse_gradients_enabled(): + for name, module in self.module.named_modules(): + if isinstance(module, torch.nn.Embedding): + self.csr_tensor_module_names.add(name + ".weight") + logger.info("Will convert {} to sparse (csr) " + "tensor during training".format(name)) + + self.save_non_zero_checkpoint = False + self.save_zero_checkpoint = False + self._configure_checkpointing(dist_init_required) + + if self.global_rank == 0: + self._config.print('DeepSpeedLight configuration') + + def wall_clock_breakdown(self): + return self._config.wall_clock_breakdown + + def memory_breakdown(self): + return self._config.memory_breakdown + + def sparse_gradients_enabled(self): + return self._config.sparse_gradients_enabled + + def train_batch_size(self): + return self._config.train_batch_size + + def train_micro_batch_size_per_gpu(self): + return self._config.train_micro_batch_size_per_gpu + + def optimizer_name(self): + return self.client_optimizer.__class__.__name__ if self.client_optimizer else self._config.optimizer_name + + def optimizer_params(self): + return self._config.optimizer_params + + def optimizer_legacy_fusion(self): + return self._config.optimizer_legacy_fusion + + def scheduler_name(self): + return self._config.scheduler_name + + def scheduler_params(self): + return self._config.scheduler_params + + def zero_optimization(self): + return self._config.zero_enabled + + def zero_allow_untested_optimizer(self): + return self._config.zero_allow_untested_optimizer + + def zero_reduce_scatter(self): + return self._config.zero_config.reduce_scatter + + def zero_overlap_comm(self): + return self._config.zero_config.overlap_comm + + def zero_cpu_offload(self): + return self._config.zero_config.cpu_offload + + def zero_optimization_stage(self): + return self._config.zero_optimization_stage + + def zero_reduce_bucket_size(self): + return self._config.zero_config.reduce_bucket_size + + def zero_allgather_bucket_size(self): + return self._config.zero_config.allgather_bucket_size + + def zero_optimization_partition_gradients(self): + return self.zero_optimization_stage() >= ZERO_OPTIMIZATION_GRADIENTS + + def zero_contiguous_gradients(self): + return self._config.zero_config.contiguous_gradients + + def zero_load_from_fp32_weights(self): + return self._config.zero_config.load_from_fp32_weights + + def fp16_enabled(self): + return self._config.fp16_enabled + + def amp_enabled(self): + return self._config.amp_enabled + + def amp_params(self): + return self._config.amp_params + + def loss_scale(self): + return self._config.loss_scale + + def gradient_accumulation_steps(self): + return self._config.gradient_accumulation_steps + + def allreduce_always_fp32(self): + return self._config.allreduce_always_fp32 + + def postscale_gradients(self): + return not self._config.prescale_gradients + + def gradient_predivide_factor(self): + return self._config.gradient_predivide_factor + + def steps_per_print(self): + return self._config.steps_per_print + + def zero_allgather_partitions(self): + return self._config.zero_config.allgather_partitions + + def dump_state(self): + return self._config.dump_state + + def gradient_clipping(self): + return self._config.gradient_clipping + + def dynamic_loss_scale(self): + return self._config.loss_scale == 0 + + def initial_dynamic_scale(self): + return self._config.initial_dynamic_scale + + def dynamic_loss_scale_args(self): + return self._config.dynamic_loss_scale_args + + def _configure_lr_scheduler(self, 
client_lr_scheduler): + # First check for scheduler in json configuration + lr_scheduler = self._scheduler_from_config(self.optimizer) + if lr_scheduler: + if self.global_rank == 0: + logger.info( + f'DeepSpeed using configured LR scheduler = {self.scheduler_name()}') + self.lr_scheduler = lr_scheduler + else: + if self.global_rank == 0: + logger.info('DeepSpeed using client LR scheduler') + self.lr_scheduler = client_lr_scheduler + log_dist(f'DeepSpeed LR Scheduler = {self.lr_scheduler}', ranks=[0]) + + def _configure_checkpointing(self, dist_init_required): + + dp_rank = self.global_rank + if self.mpu: + dp_rank = self.mpu.get_data_parallel_rank() + + # only the first data parallel process needs to store the model checkpoint + self.save_non_zero_checkpoint = (dp_rank == 0) + + if self.zero_optimization(): + param_rank = torch.distributed.get_rank( + group=self.optimizer.dp_process_group) + + # Only the first parameter parallel process needs to store the + # optimizer state checkpoints for zero + self.save_zero_checkpoint = (param_rank == dp_rank) + + def _scheduler_from_config(self, optimizer): + scheduler_name = self.scheduler_name() + if scheduler_name is not None: + if hasattr(lr_schedules, scheduler_name): + scheduler = getattr(lr_schedules, scheduler_name) + else: + assert hasattr(torch.optim.lr_scheduler, scheduler_name), \ + f"DeepSpeed does not recognize LR scheduler {scheduler_name}" + + scheduler = getattr(torch.optim.lr_scheduler, scheduler_name) + + scheduler_params = self.scheduler_params() + instantiated_scheduler = scheduler(optimizer, **scheduler_params) + return instantiated_scheduler + else: + return None + + def _init_distributed(self, dist_init_required): + if self.local_rank >= 0: + torch.cuda.set_device(self.local_rank) + self.device = torch.device("cuda", self.local_rank) + self.world_size = dist.get_world_size() + self.global_rank = dist.get_rank() + else: + self.world_size = 1 + self.global_rank = 0 + self.device = torch.device("cuda") + + # Configure based on command line arguments + def _configure_with_arguments(self, args, mpu): + self.local_rank = args.local_rank if hasattr(args, 'local_rank') else 0 + self._config = DeepSpeedConfig(args.deepspeed_config, + mpu, + param_dict=self.config_params) + + + def _is_supported_optimizer(self, optimizer_name): + return optimizer_name in DEEPSPEED_OPTIMIZERS or \ + getattr(torch.optim, optimizer_name, None) is not None + + def _broadcast_model(self): + for p in self.module.parameters(): + if torch.is_tensor(p): + dist.broadcast(p, + self.broadcast_src_rank, + group=self.data_parallel_group) + + def _configure_distributed_model(self, model): + self.module = model + if self.fp16_enabled(): + self.module.half() + self.module.to(self.device) + + if self.mpu is None: + self.data_parallel_group = _initialize_parameter_parallel_groups() + self.dp_world_size = dist.get_world_size() + self.mp_world_size = 1 + self.broadcast_src_rank = 0 + else: + self.data_parallel_group = self.mpu.get_data_parallel_group() + self.dp_world_size = self.mpu.get_data_parallel_world_size() + self.mp_world_size = self.mpu.get_model_parallel_world_size() + self.broadcast_src_rank = _get_global_rank( + self.mpu.get_data_parallel_group(), + 0) + + if not self.amp_enabled(): + self._broadcast_model() + + # Configure optimizer + def _configure_optimizer(self, client_optimizer, model_parameters): + + if client_optimizer is not None: + basic_optimizer = client_optimizer + if self.global_rank == 0: + logger.info('Using client Optimizer as basic optimizer') + 
else: + basic_optimizer = self._configure_basic_optimizer(model_parameters) + if self.global_rank == 0: + logger.info( + 'Using DeepSpeed Optimizer param name {} as basic optimizer'.format( + self.optimizer_name())) + + if self.global_rank == 0: + logger.info('DeepSpeed Basic Optimizer = {}'.format(basic_optimizer)) + + if self.zero_optimization(): + assert not self.amp_enabled(), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similar to amp opt_mode=O2" + if not is_zero_supported_optimizer(basic_optimizer): + assert self.zero_allow_untested_optimizer(), \ + 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' + + if self.global_rank == 0: + logger.warning( + "**** You are using ZeRO with an untested optimizer, proceed with caution *****" + ) + self.optimizer = self._configure_zero_optimizer(basic_optimizer) + elif self.amp_enabled(): + assert not self.fp16_enabled(), "Cannot enable both amp with (legacy) fp16 mode" + amp_params = self.amp_params() + if self.global_rank == 0: + logger.info(f"Initializing AMP with these params: {amp_params}") + self.module, self.optimizer = amp.initialize(self.module, basic_optimizer, **amp_params) + self._broadcast_model() + elif self.fp16_enabled(): + self.optimizer = self._configure_fp16_optimizer(basic_optimizer) + else: + self.optimizer = basic_optimizer + logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer)) + logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer.state_dict())) + + def _configure_basic_optimizer(self, model_parameters): + optimizer_parameters = self.optimizer_params() + # print(optimizer_parameters.keys()) + if 'max_grad_norm' in optimizer_parameters.keys(): + raise ValueError( + "'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details" + ) + if self.optimizer_name() == ADAM_OPTIMIZER: + if self.zero_cpu_offload(): + optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters) + else: + from apex.optimizers.fused_adam import FusedAdam + optimizer = FusedAdam(model_parameters, **optimizer_parameters) + elif self.optimizer_name() == DEEPSPEED_ADAM: + from deepspeed.ops.adam import DeepSpeedCPUAdam + optimizer = DeepSpeedCPUAdam(model_parameters, **optimizer_parameters) + elif self.optimizer_name() == LAMB_OPTIMIZER: + from deepspeed.ops.lamb import FusedLamb + optimizer = FusedLamb(model_parameters, **optimizer_parameters) + elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: + from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) + else: + torch_optimizer = getattr(torch.optim, self.optimizer_name()) + optimizer = torch_optimizer(model_parameters, **optimizer_parameters) + return optimizer + + def _configure_fp16_optimizer(self, optimizer): + initial_dynamic_scale = self.initial_dynamic_scale() + dynamic_loss_args = self.dynamic_loss_scale_args() + clip_grad = self.gradient_clipping() + if isinstance(optimizer, + apex.optimizers.FusedAdam) or self.optimizer_name( + ) == ONEBIT_ADAM_OPTIMIZER: + if self.dynamic_loss_scale(): + logger.info('Creating fp16 optimizer with dynamic loss scale') + optimizer = FP16_Optimizer( + optimizer, + dynamic_loss_scale=True, + initial_dynamic_scale=initial_dynamic_scale, + dynamic_loss_args=dynamic_loss_args, + 
mpu=self.mpu, + clip_grad=clip_grad, + fused_adam_legacy=self.optimizer_legacy_fusion(), + timers=None) + else: + logger.info('Creating fp16 optimizer with static loss scale: {}'.format( + self.loss_scale())) + optimizer = FP16_Optimizer( + optimizer, + static_loss_scale=self.loss_scale(), + mpu=self.mpu, + clip_grad=clip_grad, + fused_adam_legacy=self.optimizer_legacy_fusion()) + else: + logger.info('Creating fp16 unfused optimizer with dynamic loss scale') + optimizer = FP16_UnfusedOptimizer( + optimizer, + dynamic_loss_scale=self.dynamic_loss_scale(), + dynamic_loss_args=dynamic_loss_args, + mpu=self.mpu, + clip_grad=clip_grad, + fused_lamb_legacy=self.optimizer_name() == LAMB_OPTIMIZER) + + return optimizer + + def _configure_zero_optimizer(self, optimizer): + zero_stage = self.zero_optimization_stage() + logger.info('Creating fp16 ZeRO stage {} optimizer'.format(zero_stage)) + + if zero_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES: + assert self.zero_reduce_scatter(), 'Stage 1 only supports reduce scatter mode' + optimizer = FP16_DeepSpeedZeroOptimizer_Stage1( + optimizer, + static_loss_scale=self.loss_scale(), + dynamic_loss_scale=self.dynamic_loss_scale(), + dynamic_loss_args=self.dynamic_loss_scale_args(), + clip_grad=self.gradient_clipping(), + all_gather_partitions=self.zero_allgather_partitions(), + allgather_size=self.zero_allgather_bucket_size(), + max_elements_per_comm=self.zero_reduce_bucket_size(), + dp_process_group=self.data_parallel_group, + mpu=self.mpu) + elif zero_stage == ZERO_OPTIMIZATION_GRADIENTS: + optimizer = FP16_DeepSpeedZeroOptimizer( + optimizer, + timers=DummyTimer(), # None, + static_loss_scale=self.loss_scale(), + dynamic_loss_scale=self.dynamic_loss_scale(), + dynamic_loss_args=self.dynamic_loss_scale_args(), + clip_grad=self.gradient_clipping(), + contiguous_gradients=self.zero_contiguous_gradients(), + reduce_bucket_size=self.zero_reduce_bucket_size(), + allgather_bucket_size=self.zero_allgather_bucket_size(), + dp_process_group=self.data_parallel_group, + reduce_scatter=self.zero_reduce_scatter(), + overlap_comm=self.zero_overlap_comm(), + cpu_offload=self.zero_cpu_offload(), + mpu=self.mpu, + postscale_gradients=self.postscale_gradients(), + gradient_predivide_factor=self.gradient_predivide_factor(), + gradient_accumulation_steps=self.gradient_accumulation_steps(), + verbose=False + ) + else: + raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) + + return optimizer + + def deepspeed_io(self, + dataset, + batch_size=None, + route=ROUTE_TRAIN, + pin_memory=True, + data_sampler=None, + collate_fn=None, + num_local_io_workers=None): + if not isinstance(dataset, torch.utils.data.Dataset): + raise ValueError("Training data must be a torch Dataset") + + if data_sampler is None and (route == ROUTE_PREDICT or route == ROUTE_EVAL): + data_sampler = torch.utils.data.SequentialSampler(dataset) + + if batch_size is None: + batch_size = self.train_micro_batch_size_per_gpu() + + if collate_fn is None: + collate_fn = self.collate_fn + + # If mpu is provied, forward world size and parallel rank to sampler. 
+ data_parallel_world_size = None + data_parallel_rank = None + if self.mpu is not None: + data_parallel_world_size = self.mpu.get_data_parallel_world_size() + data_parallel_rank = self.mpu.get_data_parallel_rank() + + return DeepSpeedDataLoader(dataset=dataset, + batch_size=batch_size, + pin_memory=pin_memory, + collate_fn=collate_fn, + local_rank=self.local_rank, + tput_timer=None, + num_local_io_workers=num_local_io_workers, + data_sampler=data_sampler, + data_parallel_world_size=data_parallel_world_size, + data_parallel_rank=data_parallel_rank) + + def train(self): + r""" + """ + + self.warn_unscaled_loss = True + self.module.train() + + def eval(self): + r""" + """ + + self.warn_unscaled_loss = True + self.module.train(False) + + def _scale_loss(self, prescaled_loss): + if isinstance(prescaled_loss, torch.Tensor): + scaled_loss = prescaled_loss / self.gradient_accumulation_steps() + elif isinstance(prescaled_loss, tuple) or isinstance(prescaled_loss, list): + scaled_loss = [] + for l in prescaled_loss: + if isinstance(l, torch.Tensor): + scaled_loss.append(l / self.gradient_accumulation_steps()) + else: + scaled_loss.append(l) + else: + scaled_loss = prescaled_loss + if self.warn_unscaled_loss: + logger.warning( + f'DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}' + ) + self.warn_unscaled_loss = False + + return scaled_loss + + def forward(self, *inputs, **kwargs): + r"""Execute forward propagation + + Arguments: + *inputs: Variable length input list + **kwargs: variable length keyword arguments + """ + loss = self.module(*inputs, **kwargs) + return loss + + def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): + #Zero stage 2 communicates during non gradient accumulation boundaries as well + if self.zero_optimization_partition_gradients(): + self.optimizer.overlapping_partition_gradients_reduce_epilogue() + + #Communicate only at gradient accumulation boundaries + elif self.is_gradient_accumulation_boundary(): + if self.zero_optimization_stage() == ZERO_OPTIMIZATION_OPTIMIZER_STATES: + assert self.zero_reduce_scatter() + self.optimizer.reduce_scatter_gradients( + postscale_gradients=self.postscale_gradients(), + gradient_predivide_factor=self.gradient_predivide_factor(), + gradient_average=self.gradient_average) + else: + self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) + + def backward(self, loss, allreduce_gradients=True, release_loss=False): + r"""Execute backward pass on the loss + + Arguments: + loss: Torch tensor on which to execute backward propagation + allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True. + """ + + # scale loss w.r.t. 
gradient accumulation if needed + if self.gradient_accumulation_steps() > 1: + loss = self._scale_loss(loss.float()) + + assert self.optimizer is not None, "must provide optimizer during " \ + "init in order to use backward" + + if self.zero_optimization(): + self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( + ) + self.optimizer.backward(loss) + elif self.amp_enabled(): + # AMP requires delaying unscale when inside gradient accumulation boundaries + # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations + delay_unscale = not self.is_gradient_accumulation_boundary() + with amp.scale_loss(loss, + self.optimizer, + delay_unscale=delay_unscale) as scaled_loss: + scaled_loss.backward() + elif self.fp16_enabled(): + self.optimizer.backward(loss) + else: + loss.backward() + + if allreduce_gradients and self.enable_backward_allreduce: + self.allreduce_gradients() + + if release_loss: + # loss.data = None + pass + + return loss + + def is_gradient_accumulation_boundary(self): + """Query whether the current micro-batch is at the boundary of + gradient accumulation, and thus will trigger gradient reductions and + an optimizer step. + + Returns: + bool: if the current step is a gradient accumulation boundary. + """ + return (self.micro_steps + 1) % \ + self.gradient_accumulation_steps() == 0 + + def zero_grad(self): + """ + Zero parameter grads. + """ + for param_name, param in self.module.named_parameters(): + param.grad = None + + def clip_fp32_gradients(self): + torch.nn.utils.clip_grad_norm_(parameters=self.module.parameters(), + max_norm=self.gradient_clipping()) + + def _take_model_step(self): + if self.gradient_clipping() > 0.0: + if not self.fp16_enabled() and not self.amp_enabled(): + self.clip_fp32_gradients() + elif self.amp_enabled(): + # AMP's recommended way of doing clipping + # https://nvidia.github.io/apex/advanced.html#gradient-clipping + master_params = amp.master_params(self.optimizer) + torch.nn.utils.clip_grad_norm_(parameters=master_params, + max_norm=self.gradient_clipping()) + self.optimizer.step() + + #zero grad in basic optimizer could be unreliable and may not exhibit + #the behaviour that we want + if not self.zero_optimization() and not self.fp16_enabled( + ) and not self.amp_enabled(): + self.zero_grad() + else: + self.optimizer.zero_grad() + + report_progress = self.global_rank == 0 if self.global_rank else True + + # Check overlow here since in DS fp16 optimizer, the overflow is updated in above step() function. + overflow = False + if hasattr(self.optimizer, 'overflow'): + overflow = self.optimizer.overflow + + if overflow: + self.skipped_steps += 1 + else: + if self.lr_scheduler is not None: + self.lr_scheduler.step() + if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: + self._report_progress(self.global_steps + 1) + + self.global_steps += 1 + self.global_samples += self.train_batch_size() + + def step(self): + r"""Execute the weight update step after forward and backward propagation + on effective_train_batch. 
+ """ + + assert self.optimizer is not None, "must provide optimizer during " \ + "init in order to use step" + report_progress = self.global_rank == 0 if self.global_rank else True + + # Update the model when we reach gradient accumulation boundaries + if self.is_gradient_accumulation_boundary(): + self._take_model_step() + + self.micro_steps += 1 + + def _get_optimizer_param(self, param_name): + result = [] + if not self.optimizer: + return result + for group in self.optimizer.param_groups: + if param_name in group: + result.append(group[param_name]) + else: + result.append(0.0) + return result + + def get_lr(self): + return self._get_optimizer_param('lr') + + def get_type(self): + return self._get_optimizer_param('type') + + def get_mom(self): + return self._get_optimizer_param('betas') + + def _report_progress(self, step): + lr = self.get_lr() + mom = self.get_mom() + log_dist(f'step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}', + ranks=[0]) + + def allreduce_bucket(self, bucket): + tensor = flatten(bucket) + + tensor_to_allreduce = tensor + + if self.allreduce_always_fp32(): + tensor_to_allreduce = tensor.float() + + if self.postscale_gradients(): + if self.gradient_predivide_factor() != 1.0: + tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor()) + + dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group) + + if self.gradient_average: + if self.gradient_predivide_factor() != self.dp_world_size: + tensor_to_allreduce.mul_(self.gradient_predivide_factor() / + self.dp_world_size) + else: + tensor_to_allreduce.div_(self.dp_world_size) + dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group) + + if self.allreduce_always_fp32() and tensor is not tensor_to_allreduce: + tensor.copy_(tensor_to_allreduce) + + return tensor + + def allreduce_and_copy(self, small_bucket): + allreduced = self.allreduce_bucket(small_bucket) + for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): + buf.copy_(synced) + + def allreduce_no_retain(self, bucket, numel_per_bucket=500000000): + small_bucket = [] + numel = 0 + for tensor in bucket: + small_bucket.append(tensor) + numel = numel + tensor.numel() + if numel > numel_per_bucket: + self.allreduce_and_copy(small_bucket) + small_bucket = [] + numel = 0 + if len(small_bucket) > 0: + self.allreduce_and_copy(small_bucket) + + def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000): + grads = [] + for param_name, param in self.module.named_parameters(): + if param.grad is None: + # In cases where there is an imbalance of empty grads across + # ranks we must create empty grads, this will ensure that every + # rank is reducing the same size. In some cases it may make + # sense in the future to support the ability to average not + # w.r.t. world size but with a different value. 
+ param.grad = torch.zeros(param.size(), + dtype=param.dtype, + device=param.device) + grads.append(param.grad.data) + else: + grad_data = param.grad.data + if self.sparse_gradients_enabled( + ) and param_name in self.csr_tensor_module_names: + grads.append(CSRTensor(grad_data)) + else: + grads.append(grad_data) + + split_buckets = split_half_float_double_csr(grads) + + for i, bucket_tuple in enumerate(split_buckets): + bucket_type, bucket = bucket_tuple + if bucket_type == CSRTensor.type(): + self.csr_allreduce_no_retain(bucket) + else: + self.allreduce_no_retain(bucket, numel_per_bucket=elements_per_buffer) + + def csr_allreduce_no_retain(self, bucket): + allreduced_csrs = self.csr_allreduce_bucket(bucket) + # Densify csr tensor and copy back to original location + for csr in allreduced_csrs: + dense_tensor = csr.to_dense() + csr.orig_dense_tensor.copy_(dense_tensor) + + def csr_allreduce_bucket(self, bucket): + csr_list = [] + for csr in bucket: + csr_list.append(self.csr_allreduce(csr)) + return csr_list + + def csr_allreduce(self, csr): + # Pre-divide for fp16 stability + csr.values.div_(self.dp_world_size) + + indices_device_list = self.csr_all_gather(csr.indices) + values_device_list = self.csr_all_gather(csr.values) + + csr.indices = torch.cat(indices_device_list) + csr.values = torch.cat(values_device_list) + return csr + + def csr_all_gather(self, value): + my_size = torch.LongTensor([value.size()[0]]).to(self.device) + all_sizes = self.all_gather_scalar(my_size) + max_size = torch.cat(all_sizes).max() + fill_size = (max_size - my_size) + + assert value.dim() in [1, 2] + if value.dim() == 1: + if fill_size > 0: + value = torch.cat([value, value.new_zeros(fill_size)]) + tensor_list = [value.new_zeros(max_size) for _ in range(self.dp_world_size)] + else: + if fill_size > 0: + value = torch.cat([value, value.new_zeros(fill_size, value.size()[1])]) + tensor_list = [ + value.new_zeros(max_size, + value.size()[1]) for _ in range(self.dp_world_size) + ] + + dist.all_gather(tensor_list, value, group=self.data_parallel_group) + tensors = [] + for dev_idx, t in enumerate(tensor_list): + size = all_sizes[dev_idx][0] + tensors.append( + t.index_select(0, + torch.LongTensor(range(size)).to(self.device))) + + return tensors + + def all_gather_scalar(self, value): + tensor_list = [value.new_zeros(value.size()) for _ in range(self.dp_world_size)] + dist.all_gather(tensor_list, value, group=self.data_parallel_group) + return tensor_list + + def module_state_dict(self, destination=None, prefix='', keep_vars=False): + sd = self.module.state_dict(destination, prefix, keep_vars) + return sd + + def load_module_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + + def _get_rank_zero_ckpt_name(self, checkpoints_path, tag, mp_rank, dp_rank): + filename = 'zero_pp_rank_{}'.format(dp_rank) + zero_ckpt_name = os.path.join( + checkpoints_path, + str(tag), + filename + '_mp_rank_{:02d}'.format(mp_rank) + 'optim_states.pt') + return zero_ckpt_name + + def _get_zero_ckpt_name(self, checkpoints_path, tag): + mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() + pp_rank = torch.distributed.get_rank(group=self.optimizer.dp_process_group) + return self._get_rank_zero_ckpt_name(checkpoints_path, tag, mp_rank, pp_rank) + + def _get_ckpt_name(self, checkpoints_path, tag): + mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() + ckpt_name = os.path.join(checkpoints_path, + str(tag), + 'mp_rank_{:02d}'.format(mp_rank) + 
'_model_states.pt') + return ckpt_name + + def load_checkpoint(self, + load_dir, + tag, + load_module_strict=True, + load_optimizer_states=True, + load_lr_scheduler_states=True): + r"""Load training checkpoint + + Arguments: + load_dir: Required. Directory to load the checkpoint from + tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step. + load_module_strict: Optional. Boolean to strictly enforce that the keys in state_dict of module and checkpoint match. + load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint. Ex. ADAM's momentum and variance + load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint. + Return: + load_path: Path of the loaded checkpoint. None if loading the checkpoint failed + client_state: State dictionary used for loading required training states in the client code. + """ + + load_path, client_states = self._load_checkpoint(load_dir, + tag, + load_module_strict=load_module_strict, + load_optimizer_states=load_optimizer_states, + load_lr_scheduler_states=load_lr_scheduler_states) + + if self.zero_optimization() and load_path is not None: + self._load_zero_checkpoint(load_dir, + tag, + load_optimizer_states=load_optimizer_states) + + return load_path, client_states + + def _load_checkpoint(self, + load_dir, + tag, + load_module_strict=True, + load_optimizer_states=True, + load_lr_scheduler_states=True): + + load_path = self._get_ckpt_name(load_dir, tag) + + if not os.path.exists(load_path): + logger.warn( + 'Client provided checkpoint load path: {} does not exist ... skip checkpoint load' + .format(load_path)) + return None, None + + logger.info(f'rank: {self.global_rank} loading checkpoint: {load_path}') + checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage) + + self.load_module_state_dict(state_dict=checkpoint['module'], + strict=load_module_strict) + if not self.zero_optimization(): + if self.fp16_enabled(): + self.optimizer.load_state_dict( + checkpoint['optimizer'], + load_optimizer_states=load_optimizer_states) + elif load_optimizer_states: + self.optimizer.load_state_dict(checkpoint['optimizer']) + + if load_lr_scheduler_states and self.lr_scheduler is not None: + self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + + self.csr_tensor_module_names = checkpoint['csr_tensor_module_names'] + self.global_steps = checkpoint['global_steps'] + self.global_samples = checkpoint.get('global_samples', + self.global_steps * self.train_batch_size()) + self.skipped_steps = checkpoint['skipped_steps'] + self.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size'] + self.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size'] + deepspeed_states = [ + 'module', + 'optimizer', + 'lr_scheduler', + 'csr_tensor_module_names', + 'skipped_steps', + 'global_steps', + 'dp_world_size', + 'mp_world_size' + ] + client_state = { + key: value + for key, + value in checkpoint.items() if not key in deepspeed_states + } + + return load_path, client_state + + def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True): + zero_sd_list = self._get_all_zero_checkpoints(load_dir, tag) + if zero_sd_list is None: + return + + self.optimizer.load_state_dict( + state_dict_list=zero_sd_list, + load_optimizer_states=load_optimizer_states, + load_from_fp32_weights=self.zero_load_from_fp32_weights()) + print( + f'loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}' + ) + + def 
_get_mp_rank_zero_checkpoint_names(self, load_dir, tag, mp_rank, dp_world_size): + zero_ckpt_names = [] + for dp_rank in range(dp_world_size): + ckpt_name = self._get_rank_zero_ckpt_name(checkpoints_path=load_dir, + tag=tag, + mp_rank=mp_rank, + dp_rank=dp_rank) + zero_ckpt_names.append(ckpt_name) + + return zero_ckpt_names + + def _get_all_zero_checkpoint_names(self, + load_dir, + tag, + mp_world_size, + dp_world_size): + zero_ckpt_names = [] + for mp_rank in range(mp_world_size): + mp_rank_ckpt_names = self._get_mp_rank_zero_checkpoint_names( + load_dir=load_dir, + tag=tag, + mp_rank=mp_rank, + dp_world_size=dp_world_size) + zero_ckpt_names += mp_rank_ckpt_names + + return zero_ckpt_names + + def _get_all_zero_checkpoints(self, load_dir, tag): + mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() + zero_ckpt_names = self._get_mp_rank_zero_checkpoint_names( + load_dir=load_dir, + tag=tag, + mp_rank=mp_rank, + dp_world_size=self.loaded_checkpoint_dp_world_size) + invalid_zero_ckpt_paths = [] + for ckpt_name in zero_ckpt_names: + if not os.path.exists(ckpt_name): + invalid_zero_ckpt_paths.append(ckpt_name) + + if len(invalid_zero_ckpt_paths) > 0: + logger.warn( + f"Client provided zero checkpoint load paths: {invalid_zero_ckpt_paths} does not exist" + ) + return None + + zero_sd_list = [] + for ckpt_name in zero_ckpt_names: + zero_sd_list.append(torch.load(ckpt_name, map_location='cpu')) + + zero_optimizer_sd = [sd['optimizer_state_dict'] for sd in zero_sd_list] + print( + f"successfully loaded {len(zero_optimizer_sd)} ZeRO state_dicts for rank {self.global_rank}" + ) + return zero_optimizer_sd + + def save_checkpoint(self, save_dir, tag, client_state={}): + r"""Save training checkpoint + + Arguments: + save_dir: Required. Directory for saving the checkpoint + tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step. + client_state: Optional. State dictionary used for saving required training states in the client code. 
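+
+        Return:
+            True once the checkpoint files this rank is responsible for have been written.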
+ """ + + # This is to make sure the checkpoint names are created without collision + # There seems to be issue creating them in parallel + + if self.save_non_zero_checkpoint: + self._create_checkpoint_file(save_dir, tag, False) + self._save_checkpoint(save_dir, tag, client_state=client_state) + + if self.save_zero_checkpoint: + self._create_zero_checkpoint_files(save_dir, tag) + self._save_zero_checkpoint(save_dir, tag) + + return True \ No newline at end of file diff --git a/allennlp/training/deepspeed_trainer.py b/allennlp/training/deepspeed_trainer.py index ecd9998719e..13fc099b2df 100644 --- a/allennlp/training/deepspeed_trainer.py +++ b/allennlp/training/deepspeed_trainer.py @@ -8,8 +8,6 @@ import os import re import math -import json -import tempfile import time import traceback from copy import deepcopy @@ -24,6 +22,8 @@ from torch.nn.parallel import DistributedDataParallel import deepspeed +# from deepspeed.runtime.engine import DeepSpeedEngine +from allennlp.training.deepspeed_engine_adapter import AllennlpDeepSpeedEngineAdapter as DeepSpeedEngine from allennlp.common import Lazy, Registrable, Tqdm, Params, FromParams from allennlp.common import util as common_util @@ -44,6 +44,11 @@ JsonDict = Dict[str, Any] +# import torch.autograd.profiler as profiler +# import sys; sys.tracebacklimit = 0 +# from pyinstrument import Profiler +# profiler = Profiler() + class DeepspeedConfig(FromParams): def __init__( self, @@ -61,46 +66,39 @@ def __init__( self.zero_allow_untested_optimizer = zero_allow_untested_optimizer self.wall_clock_breakdown = wall_clock_breakdown - @staticmethod - def build_deepspeed_args(local_rank: int = 0): - from argparse import Namespace - - args = dict(deepspeed_config=deepspeed_config_path, deepspeed=True, local_rank=local_rank) - return Namespace(**args) - - @property - def config(self): - # return { + # self.config = { # 'fp16': self.fp16, # 'amp': self.amp, # 'zero_optimization': self.zero_optimization, # 'zero_allow_untested_optimizer': self.zero_allow_untested_optimizer # } - return vars(self) def launch( self, model: torch.nn.Module, - optimizer: Union[str, torch.optim.Optimizer], local_rank: int, - serialization_dir: str, batch_size: int, gradient_accumulation_steps: int, **kwargs ): - path = '' - config = dict(**self.config, train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) - ds = deepspeed.initialize( - args=self.build_deepspeed_args(path, local_rank), + from argparse import Namespace + + args = Namespace(deepspeed_config=None, deepspeed=True, local_rank=local_rank) + config = dict(**vars(self), train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) + + ds = DeepSpeedEngine( + args=args, model=model, + # optimizer=optimizer, + # optimizer=Optimizer.default(model.named_parameters()), model_parameters=model.parameters(), + # training_data=training_data, + # lr_scheduler=lr_scheduler, + # mpu=mpu, dist_init_required=False, - config_params=config, - **kwargs + config_params=config ) - - # os.remove(path) - return ds + return ds, ds.optimizer @Trainer.register("deepspeed", constructor="from_partial_objects") class DeepspeedTrainer(Trainer): @@ -179,11 +177,9 @@ def __init__( self._pytorch_model = self.model self._ds_config = deepspeed_config - self.model_engine, self.optimizer, _, _ = self._ds_config.launch( + self.model_engine, self.optimizer, *_ = self._ds_config.launch( self.model, - None, local_rank, - serialization_dir, self.data_loader.batch_size, num_gradient_accumulation_steps ) @@ 
-200,7 +196,12 @@ def batch_outputs(self, batch: TensorDict, for_training: bool) -> Dict[str, torc """ # batch = nn_util.move_to_device(batch, self.cuda_device) batch = nn_util.move_to_device(batch, self.model_engine.device) + # with profiler.profile(use_cuda=True, profile_memory=True) as prof: + #with profiler.record_function("forward"): + # with Profiler() as profiler: output_dict = self.model_engine(**batch) + # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)) + # print(profiler.output_text(unicode=True, color=True, show_all=True, timeline=True)) if for_training: try: @@ -750,13 +751,6 @@ def from_partial_objects( common_util.log_frozen_and_tunable_parameter_names(model) - batches_per_epoch: Optional[int] - try: - batches_per_epoch = len(data_loader) - batches_per_epoch = math.ceil(batches_per_epoch / num_gradient_accumulation_steps) - except TypeError: - batches_per_epoch = None - parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] moving_average_ = moving_average.construct(parameters=parameters) From f0da3bf0624e84ece5e312e99852a02bec90f366 Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Sat, 10 Oct 2020 18:24:25 -0400 Subject: [PATCH 05/20] 1416 LOC -> 562 --- allennlp/training/__init__.py | 8 +- allennlp/training/deepspeed/__init__.py | 1 + .../{ => deepspeed}/deepspeed_trainer.py | 37 +- allennlp/training/deepspeed/engine_adapter.py | 562 ++++++++ .../training/deepspeed/optimizers/__init__.py | 0 .../training/deepspeed/optimizers/basic.py | 51 + .../training/deepspeed/optimizers/fp16.py | 53 + .../deepspeed/optimizers/zero_optimization.py | 127 ++ allennlp/training/deepspeed_engine_adapter.py | 1130 ----------------- 9 files changed, 817 insertions(+), 1152 deletions(-) create mode 100644 allennlp/training/deepspeed/__init__.py rename allennlp/training/{ => deepspeed}/deepspeed_trainer.py (97%) create mode 100644 allennlp/training/deepspeed/engine_adapter.py create mode 100644 allennlp/training/deepspeed/optimizers/__init__.py create mode 100644 allennlp/training/deepspeed/optimizers/basic.py create mode 100644 allennlp/training/deepspeed/optimizers/fp16.py create mode 100644 allennlp/training/deepspeed/optimizers/zero_optimization.py delete mode 100644 allennlp/training/deepspeed_engine_adapter.py diff --git a/allennlp/training/__init__.py b/allennlp/training/__init__.py index 9cb0154bea0..4ee8019687b 100644 --- a/allennlp/training/__init__.py +++ b/allennlp/training/__init__.py @@ -1,7 +1,6 @@ from allennlp.training.checkpointer import Checkpointer from allennlp.training.tensorboard_writer import TensorboardWriter from allennlp.training.no_op_trainer import NoOpTrainer -from allennlp.training.deepspeed_trainer import DeepspeedTrainer from allennlp.training.trainer import ( Trainer, GradientDescentTrainer, @@ -9,3 +8,10 @@ EpochCallback, TrackEpochCallback, ) +from allennlp.training.deepspeed import DeepspeedTrainer + +# import warnings +# try: +# from allennlp.training.deepspeed import DeepspeedTrainer +# except ImportError: +# warnings.warn('Deepspeed plugin not installed. 
Ignoring.') \ No newline at end of file diff --git a/allennlp/training/deepspeed/__init__.py b/allennlp/training/deepspeed/__init__.py new file mode 100644 index 00000000000..682238521aa --- /dev/null +++ b/allennlp/training/deepspeed/__init__.py @@ -0,0 +1 @@ +from allennlp.training.deepspeed.deepspeed_trainer import DeepspeedTrainer \ No newline at end of file diff --git a/allennlp/training/deepspeed_trainer.py b/allennlp/training/deepspeed/deepspeed_trainer.py similarity index 97% rename from allennlp/training/deepspeed_trainer.py rename to allennlp/training/deepspeed/deepspeed_trainer.py index 13fc099b2df..33e70eeb1b5 100644 --- a/allennlp/training/deepspeed_trainer.py +++ b/allennlp/training/deepspeed/deepspeed_trainer.py @@ -23,7 +23,6 @@ import deepspeed # from deepspeed.runtime.engine import DeepSpeedEngine -from allennlp.training.deepspeed_engine_adapter import AllennlpDeepSpeedEngineAdapter as DeepSpeedEngine from allennlp.common import Lazy, Registrable, Tqdm, Params, FromParams from allennlp.common import util as common_util @@ -40,22 +39,21 @@ from allennlp.training.tensorboard_writer import TensorboardWriter from allennlp.training.trainer import Trainer, BatchCallback, EpochCallback +from allennlp.training.deepspeed.engine_adapter import AllennlpDeepSpeedEngineAdapter as DeepSpeedEngine +from allennlp.training.deepspeed.optimizers.zero_optimization import ZeroOptimizer + logger = logging.getLogger(__name__) JsonDict = Dict[str, Any] -# import torch.autograd.profiler as profiler -# import sys; sys.tracebacklimit = 0 -# from pyinstrument import Profiler -# profiler = Profiler() - class DeepspeedConfig(FromParams): def __init__( self, - optimizer: JsonDict, + optimizer: Lazy[Optimizer], # JsonDict, fp16: JsonDict = {'enabled': False}, amp: JsonDict = {'enabled': False}, zero_optimization: Union[bool, Dict] = False, + zero_optimizer: Lazy[ZeroOptimizer] = None, zero_allow_untested_optimizer: bool = True, wall_clock_breakdown: bool = False ): @@ -65,13 +63,7 @@ def __init__( self.zero_optimization = zero_optimization self.zero_allow_untested_optimizer = zero_allow_untested_optimizer self.wall_clock_breakdown = wall_clock_breakdown - - # self.config = { - # 'fp16': self.fp16, - # 'amp': self.amp, - # 'zero_optimization': self.zero_optimization, - # 'zero_allow_untested_optimizer': self.zero_allow_untested_optimizer - # } + self._zero_optim = zero_optimizer def launch( self, @@ -82,19 +74,22 @@ def launch( **kwargs ): from argparse import Namespace - args = Namespace(deepspeed_config=None, deepspeed=True, local_rank=local_rank) - config = dict(**vars(self), train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) + optimizer = self.optimizer.construct(model_parameters=model.parameters()) + del self.optimizer + + zero_optim = self._zero_optim + del self._zero_optim + # del self.zero_optimization + + config = dict(**vars(self), train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) ds = DeepSpeedEngine( args=args, model=model, - # optimizer=optimizer, - # optimizer=Optimizer.default(model.named_parameters()), + optimizer=optimizer, + zero_optimizer=zero_optim, model_parameters=model.parameters(), - # training_data=training_data, - # lr_scheduler=lr_scheduler, - # mpu=mpu, dist_init_required=False, config_params=config ) diff --git a/allennlp/training/deepspeed/engine_adapter.py b/allennlp/training/deepspeed/engine_adapter.py new file mode 100644 index 00000000000..ec2b4ecf7e8 --- /dev/null +++ 
b/allennlp/training/deepspeed/engine_adapter.py @@ -0,0 +1,562 @@ +''' +Copyright 2019 The Microsoft DeepSpeed Team +''' + +import os +import torch +import warnings +import torch.distributed as dist + +import apex +from apex.optimizers import ( + FusedAdam, + FusedLAMB +) +from torch import nn +from torch.distributed.distributed_c10d import _get_global_rank + +from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer +from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 +from deepspeed.runtime.zero.utils import is_zero_supported_optimizer +from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer +from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer +from deepspeed.runtime.fp16.onebit_adam import OnebitAdam +from deepspeed.runtime.config import DeepSpeedConfig, \ + ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, DEEPSPEED_ADAM, DEEPSPEED_OPTIMIZERS +from deepspeed.runtime.dataloader import DeepSpeedDataLoader +from deepspeed.runtime.constants import \ + ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ + TORCH_DISTRIBUTED_DEFAULT_PORT +from deepspeed.runtime.zero.constants import \ + ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS +from deepspeed.runtime.csr_tensor import CSRTensor +import deepspeed.runtime.lr_schedules as lr_schedules +from deepspeed.utils import logger, log_dist +from deepspeed.runtime.engine import ( + _initialize_parameter_parallel_groups, + split_half_float_double_csr, + flatten, + unflatten, + MEMORY_OPT_ALLREDUCE_SIZE +) + +from allennlp.common import Lazy, FromParams +from allennlp.training.deepspeed.optimizers.zero_optimization import ZeroOptimizer +# from allennlp.training.deepspeed.optimizers.fp16 import DeepspeedFP16Optimizer +from allennlp.training.deepspeed.optimizers.basic import * + + +class DummyTimer: + class Timer: + def __init__(self, name): + pass + + def start(self): + pass + + def stop(self): + pass + + def reset(self): + pass + + def __init__(self): + self.timers = {} + + def __call__(self, name): + if name not in self.timers: + self.timers[name] = self.Timer(name) + return self.timers[name] + + def log(self, *args, **kwargs): + pass + + + + +class AllennlpDeepSpeedEngineAdapter(FromParams, nn.Module): + r"""DeepSpeed engine for training. 
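+
+    A trimmed adaptation of DeepSpeed's ``DeepSpeedEngine``: the AllenNLP optimizer is passed in
+    already constructed, and the ZeRO wrapper is built from a ``Lazy[ZeroOptimizer]`` rather than
+    being read out of the DeepSpeed JSON optimizer section.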
+ """ + def __init__(self, + args, + model, + optimizer=None, + zero_optimizer: Lazy[ZeroOptimizer] = None, + model_parameters=None, + lr_scheduler=None, + mpu=None, + dist_init_required=None, + config_params=None + ): + super().__init__() + self.zero_optimizer = zero_optimizer + + self.client_optimizer = optimizer + self.client_model_parameters = model_parameters + self.client_lr_scheduler = lr_scheduler + self.mpu = mpu + self.data_parallel_group = None + self.micro_steps = 0 + self.skipped_steps = 0 + self.gradient_average = True + self.warn_unscaled_loss = True + self.config_params = config_params + self.enable_backward_allreduce = True + + if dist_init_required is None: + dist_init_required = not dist.is_initialized() + + self.dist_backend = "nccl" + if dist_init_required: + if not dist.is_initialized(): + logger.info("Initializing torch distributed with backend: {}".format( + self.dist_backend)) + dist.init_process_group(backend=self.dist_backend) + else: + logger.warning( + "Was given dist_init_required=True but detected that torch" + "distributed was already initialized, cannot initialize twice.") + + self._configure_with_arguments(args, mpu) + + self._init_distributed(dist_init_required) + + # Configure distributed model + self._configure_distributed_model(model) + + # Configure optimizer and scheduler + self.optimizer = self._configure_optimizer(optimizer, model_parameters) + self._configure_lr_scheduler(lr_scheduler) + + # Bookkeeping for csr support + self.csr_tensor_module_names = set() + if self.sparse_gradients_enabled: + for name, module in self.module.named_modules(): + if isinstance(module, torch.nn.Embedding): + self.csr_tensor_module_names.add(name + ".weight") + + @property + def dynamic_loss_scale(self): + return self.loss_scale == 0 + + def _configure_lr_scheduler(self, client_lr_scheduler): + # First check for scheduler in json configuration + lr_scheduler = self._scheduler_from_config(self.optimizer) + if lr_scheduler: + self.lr_scheduler = lr_scheduler + else: + self.lr_scheduler = client_lr_scheduler + + def _scheduler_from_config(self, optimizer): + scheduler_name = self.scheduler_name + if scheduler_name is not None: + if hasattr(lr_schedules, scheduler_name): + scheduler = getattr(lr_schedules, scheduler_name) + else: + assert hasattr(torch.optim.lr_scheduler, scheduler_name), \ + f"DeepSpeed does not recognize LR scheduler {scheduler_name}" + + scheduler = getattr(torch.optim.lr_scheduler, scheduler_name) + + instantiated_scheduler = scheduler(optimizer, **self.scheduler_params) + return instantiated_scheduler + else: + return None + + def _init_distributed(self, dist_init_required): + if self.local_rank >= 0: + torch.cuda.set_device(self.local_rank) + self.device = torch.device("cuda", self.local_rank) + self.world_size = dist.get_world_size() + self.global_rank = dist.get_rank() + else: + self.world_size = 1 + self.global_rank = 0 + self.device = torch.device("cuda") + + # Configure based on command line arguments + def _configure_with_arguments(self, args, mpu): + self.local_rank = args.local_rank if hasattr(args, 'local_rank') else 0 + self._config = DeepSpeedConfig(args.deepspeed_config, + mpu, + param_dict=self.config_params) + for k, v in vars(self._config).items(): + setattr(self, k, v) + + + def _is_supported_optimizer(self, optimizer_name): + return optimizer_name in DEEPSPEED_OPTIMIZERS or \ + getattr(torch.optim, optimizer_name, None) is not None + + def _broadcast_model(self): + for p in self.module.parameters(): + if torch.is_tensor(p): + 
dist.broadcast(p,
+                               self.broadcast_src_rank,
+                               group=self.data_parallel_group)
+
+    def _configure_distributed_model(self, model):
+        self.module = model
+        if self.fp16_enabled:
+            self.module.half()
+        self.module.to(self.device)
+
+        if self.mpu is None:
+            self.data_parallel_group = _initialize_parameter_parallel_groups()
+            self.dp_world_size = dist.get_world_size()
+            self.mp_world_size = 1
+            self.broadcast_src_rank = 0
+        else:
+            self.data_parallel_group = self.mpu.get_data_parallel_group()
+            self.dp_world_size = self.mpu.get_data_parallel_world_size()
+            self.mp_world_size = self.mpu.get_model_parallel_world_size()
+            self.broadcast_src_rank = _get_global_rank(
+                self.mpu.get_data_parallel_group(),
+                0
+            )
+
+        self._broadcast_model()
+
+    def _configure_optimizer(self, client_optimizer, model_parameters):
+        basic_optimizer = client_optimizer
+
+        if self.zero_enabled: #zero_optimization: # self.zero_optimizer or
+            if not is_zero_supported_optimizer(basic_optimizer):
+                assert self.zero_allow_untested_optimizer, \
+                    'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.'
+
+                if self.global_rank == 0:
+                    logger.warning("**** You are using ZeRO with an untested optimizer, proceed with caution *****")
+
+            return self._configure_zero_optimizer(basic_optimizer)
+
+        if self.fp16_enabled:
+            return self._configure_fp16_optimizer(basic_optimizer)
+
+        return basic_optimizer
+
+
+    def _configure_fp16_optimizer(self, optimizer):
+        defaults = dict(
+            init_optimizer=optimizer,
+            mpu=self.mpu,
+            clip_grad=self.gradient_clipping,
+            fused_adam_legacy=self.optimizer_legacy_fusion,
+            timers=None,
+            verbose=False
+        )
+
+        if not self.dynamic_loss_scale:
+            return FP16_Optimizer(**defaults, static_loss_scale=self.loss_scale)
+
+        defaults.update(dict(
+            dynamic_loss_scale=True,
+            dynamic_loss_args=self.dynamic_loss_scale_args,
+        ))
+
+        if isinstance(optimizer, (FusedAdam, OnebitAdam)):
+            extras = dict(initial_dynamic_scale=self.initial_dynamic_scale)
+        else:
+            extras = dict(fused_lamb_legacy=isinstance(optimizer, FusedLAMB))
+        optimizer = FP16_Optimizer(**defaults, **extras)
+        return optimizer
+
+    def _configure_zero_optimizer(self, optimizer):
+        optimizer = self.zero_optimizer.construct(
+            init_optimizer=optimizer,
+            dp_process_group=self.data_parallel_group,
+            mpu=self.mpu
+        )
+        assert not (isinstance(optimizer, FP16_DeepSpeedZeroOptimizer_Stage1) and not self.zero_reduce_scatter), 'Stage 1 only supports reduce scatter mode'
+        return optimizer
+
+    def train(self):
+        r"""
+        """
+
+        self.warn_unscaled_loss = True
+        self.module.train()
+
+    def eval(self):
+        r"""
+        """
+
+        self.warn_unscaled_loss = True
+        self.module.train(False)
+
+    def _scale_loss(self, prescaled_loss):
+        if isinstance(prescaled_loss, torch.Tensor):
+            scaled_loss = prescaled_loss / self.gradient_accumulation_steps
+        elif isinstance(prescaled_loss, tuple) or isinstance(prescaled_loss, list):
+            scaled_loss = []
+            for l in prescaled_loss:
+                if isinstance(l, torch.Tensor):
+                    scaled_loss.append(l / self.gradient_accumulation_steps)
+                else:
+                    scaled_loss.append(l)
+        else:
+            scaled_loss = prescaled_loss
+            if self.warn_unscaled_loss:
+                logger.warning(
+                    f'DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}'
+                )
+                self.warn_unscaled_loss = False
+
+        return scaled_loss
+
+    def forward(self, *inputs, **kwargs):
+        r"""Execute forward propagation
+
+        Arguments:
+            *inputs: Variable length input list
+            **kwargs: variable length keyword arguments
+        """
+        loss = self.module(*inputs, **kwargs)
+        return loss
+
+    def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
+        #Zero stage 2 communicates during non gradient accumulation boundaries as well
+        if self.zero_optimization_stage >= ZERO_OPTIMIZATION_GRADIENTS:
+            self.optimizer.overlapping_partition_gradients_reduce_epilogue()
+
+        #Communicate only at gradient accumulation boundaries
+        elif self.is_gradient_accumulation_boundary:
+            if self.zero_optimization_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES:
+                assert self.zero_reduce_scatter
+                self.optimizer.reduce_scatter_gradients(
+                    postscale_gradients=self.postscale_gradients,
+                    gradient_predivide_factor=self.gradient_predivide_factor,
+                    gradient_average=self.gradient_average)
+            else:
+                self.buffered_allreduce_fallback(elements_per_buffer=bucket_size)
+
+    def backward(self, loss, allreduce_gradients=True, release_loss=False):
+        r"""Execute backward pass on the loss
+
+        Arguments:
+            loss: Torch tensor on which to execute backward propagation
+            allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True.
+        """
+
+        # scale loss w.r.t. gradient accumulation if needed
+        if self.gradient_accumulation_steps > 1:
+            loss = self._scale_loss(loss.float())
+
+        assert self.optimizer is not None, "must provide optimizer during " \
+            "init in order to use backward"
+
+        if self.zero_enabled: #zero_optimization:
+            self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary
+            self.optimizer.backward(loss)
+        elif self.fp16_enabled:
+            self.optimizer.backward(loss)
+        else:
+            loss.backward()
+
+        if allreduce_gradients and self.enable_backward_allreduce:
+            self.allreduce_gradients()
+
+        return loss
+
+    @property
+    def is_gradient_accumulation_boundary(self):
+        """Query whether the current micro-batch is at the boundary of
+        gradient accumulation, and thus will trigger gradient reductions and
+        an optimizer step.
+
+        Returns:
+            bool: if the current step is a gradient accumulation boundary.
+        """
+        return (self.micro_steps + 1) % \
+            self.gradient_accumulation_steps == 0
+
+    def zero_grad(self):
+        """
+        Zero parameter grads.
+        """
+        for param_name, param in self.module.named_parameters():
+            param.grad = None
+
+    def clip_fp32_gradients(self):
+        torch.nn.utils.clip_grad_norm_(parameters=self.module.parameters(),
+                                       max_norm=self.gradient_clipping)
+
+    def _take_model_step(self):
+        if self.gradient_clipping > 0.0 and not self.fp16_enabled:
+            self.clip_fp32_gradients()
+        self.optimizer.step()
+
+        #zero grad in basic optimizer could be unreliable and may not exhibit
+        #the behaviour that we want
+        if not self.zero_enabled and not self.fp16_enabled:
+            self.zero_grad()
+        else:
+            self.optimizer.zero_grad()
+
+        # Check overflow here since in DS fp16 optimizer, the overflow is updated in the above step() function.
+        overflow = False
+        if hasattr(self.optimizer, 'overflow'):
+            overflow = self.optimizer.overflow
+
+        if overflow:
+            self.skipped_steps += 1
+        else:
+            if self.lr_scheduler is not None:
+                self.lr_scheduler.step()
+
+    def step(self):
+        r"""Execute the weight update step after forward and backward propagation
+        on effective_train_batch.
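+
+        The update itself only runs when ``is_gradient_accumulation_boundary`` is True;
+        otherwise this call just advances ``micro_steps``.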
+ """ + + assert self.optimizer is not None, "must provide optimizer during " \ + "init in order to use step" + + # Update the model when we reach gradient accumulation boundaries + if self.is_gradient_accumulation_boundary: + self._take_model_step() + + self.micro_steps += 1 + + def _get_optimizer_param(self, param_name): + result = [] + if not self.optimizer: + return result + for group in self.optimizer.param_groups: + if param_name in group: + result.append(group[param_name]) + else: + result.append(0.0) + return result + + + def allreduce_bucket(self, bucket): + tensor = flatten(bucket) + + tensor_to_allreduce = tensor + + if self.allreduce_always_fp32: + tensor_to_allreduce = tensor.float() + + if self.postscale_gradients: + if self.gradient_predivide_factor != 1.0: + tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor) + + dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group) + + if self.gradient_average: + if self.gradient_predivide_factor != self.dp_world_size: + tensor_to_allreduce.mul_(self.gradient_predivide_factor / self.dp_world_size) + else: + tensor_to_allreduce.div_(self.dp_world_size) + dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group) + + if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: + tensor.copy_(tensor_to_allreduce) + + return tensor + + def allreduce_and_copy(self, small_bucket): + allreduced = self.allreduce_bucket(small_bucket) + for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): + buf.copy_(synced) + + def allreduce_no_retain(self, bucket, numel_per_bucket=500000000): + small_bucket = [] + numel = 0 + for tensor in bucket: + small_bucket.append(tensor) + numel = numel + tensor.numel() + if numel > numel_per_bucket: + self.allreduce_and_copy(small_bucket) + small_bucket = [] + numel = 0 + if len(small_bucket) > 0: + self.allreduce_and_copy(small_bucket) + + def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000): + grads = [] + for param_name, param in self.module.named_parameters(): + if param.grad is None: + # In cases where there is an imbalance of empty grads across + # ranks we must create empty grads, this will ensure that every + # rank is reducing the same size. In some cases it may make + # sense in the future to support the ability to average not + # w.r.t. world size but with a different value. 
+ param.grad = torch.zeros(param.size(), + dtype=param.dtype, + device=param.device) + grads.append(param.grad.data) + else: + grad_data = param.grad.data + if self.sparse_gradients_enabled and param_name in self.csr_tensor_module_names: + grads.append(CSRTensor(grad_data)) + else: + grads.append(grad_data) + + split_buckets = split_half_float_double_csr(grads) + + for i, bucket_tuple in enumerate(split_buckets): + bucket_type, bucket = bucket_tuple + if bucket_type == CSRTensor.type(): + self.csr_allreduce_no_retain(bucket) + else: + self.allreduce_no_retain(bucket, numel_per_bucket=elements_per_buffer) + + def csr_allreduce_no_retain(self, bucket): + allreduced_csrs = self.csr_allreduce_bucket(bucket) + # Densify csr tensor and copy back to original location + for csr in allreduced_csrs: + dense_tensor = csr.to_dense() + csr.orig_dense_tensor.copy_(dense_tensor) + + def csr_allreduce_bucket(self, bucket): + csr_list = [] + for csr in bucket: + csr_list.append(self.csr_allreduce(csr)) + return csr_list + + def csr_allreduce(self, csr): + # Pre-divide for fp16 stability + csr.values.div_(self.dp_world_size) + + indices_device_list = self.csr_all_gather(csr.indices) + values_device_list = self.csr_all_gather(csr.values) + + csr.indices = torch.cat(indices_device_list) + csr.values = torch.cat(values_device_list) + return csr + + def csr_all_gather(self, value): + my_size = torch.LongTensor([value.size()[0]]).to(self.device) + all_sizes = self.all_gather_scalar(my_size) + max_size = torch.cat(all_sizes).max() + fill_size = (max_size - my_size) + + assert value.dim() in [1, 2] + if value.dim() == 1: + if fill_size > 0: + value = torch.cat([value, value.new_zeros(fill_size)]) + tensor_list = [value.new_zeros(max_size) for _ in range(self.dp_world_size)] + else: + if fill_size > 0: + value = torch.cat([value, value.new_zeros(fill_size, value.size()[1])]) + tensor_list = [ + value.new_zeros(max_size, + value.size()[1]) for _ in range(self.dp_world_size) + ] + + dist.all_gather(tensor_list, value, group=self.data_parallel_group) + tensors = [] + for dev_idx, t in enumerate(tensor_list): + size = all_sizes[dev_idx][0] + tensors.append( + t.index_select(0, + torch.LongTensor(range(size)).to(self.device))) + + return tensors + + def all_gather_scalar(self, value): + tensor_list = [value.new_zeros(value.size()) for _ in range(self.dp_world_size)] + dist.all_gather(tensor_list, value, group=self.data_parallel_group) + return tensor_list \ No newline at end of file diff --git a/allennlp/training/deepspeed/optimizers/__init__.py b/allennlp/training/deepspeed/optimizers/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/allennlp/training/deepspeed/optimizers/basic.py b/allennlp/training/deepspeed/optimizers/basic.py new file mode 100644 index 00000000000..d33ab90bad9 --- /dev/null +++ b/allennlp/training/deepspeed/optimizers/basic.py @@ -0,0 +1,51 @@ +from deepspeed.runtime.zero.utils import is_zero_supported_optimizer +from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer +from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer +from deepspeed.runtime.config import ( + DeepSpeedConfig, + ADAM_OPTIMIZER, + LAMB_OPTIMIZER, + ONEBIT_ADAM_OPTIMIZER, + DEEPSPEED_ADAM, + DEEPSPEED_OPTIMIZERS +) + +from apex.optimizers.fused_adam import FusedAdam +from deepspeed.ops.adam import DeepSpeedCPUAdam +from deepspeed.ops.lamb import FusedLamb + +# from allennlp.common import Registrable +from allennlp.training.optimizers import Optimizer + +# class 
DeepspeedOptimizer(Registrable): +# default_implementation = "fused_adam" + +# DeepspeedOptimizer.register('adam_cpu')(FP16_DeepSpeedZeroOptimizer_Stage1) +# DeepspeedOptimizer.register('fused_adam')(FusedAdam) +# DeepspeedOptimizer.register('deepspeed_adam')(DeepSpeedCPUAdam) +# DeepspeedOptimizer.register('one_bit_adam') +# DeepspeedOptimizer.register('lamb') + +# Optimizer.register('adam_cpu')(FP16_DeepSpeedZeroOptimizer_Stage1) +# Optimizer.register('fused_adam')(FusedAdam) +# Optimizer.register('deepspeed_cpu_adam')(DeepSpeedCPUAdam) +# Optimizer.register('lamb')(FusedLamb) + +@Optimizer.register('fused_adam', constructor='construct') +class DeepspeedFusedAdamOptimizer(Optimizer, FusedAdam): + # def __init__( + # self, + # model_parameters, + # **kwargs + # ): + # super().__init__(model_parameters, **kwargs) + + @staticmethod + def construct(model_parameters, **kwargs): + return FusedAdam(model_parameters, **kwargs) + +try: + from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + Optimizer.register('one_bit_adam')(OnebitAdam) +except ImportError: + pass \ No newline at end of file diff --git a/allennlp/training/deepspeed/optimizers/fp16.py b/allennlp/training/deepspeed/optimizers/fp16.py new file mode 100644 index 00000000000..538ff2218d4 --- /dev/null +++ b/allennlp/training/deepspeed/optimizers/fp16.py @@ -0,0 +1,53 @@ +from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer +from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer +from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + +from allennlp.common import Registrable, Lazy +from allennlp.training.optimizers import Optimizer + + +class DeepspeedFP16Optimizer(Registrable): + default_implementation = 'fused' + +@DeepspeedFP16Optimizer.register('fused', constructor='construct') +class DeepspeedFusedFP16Optimizer(DeepspeedFP16Optimizer): + @staticmethod + def construct( + init_optimizer: Optimizer, + mpu=None, + clip_grad=0.0, + static_loss_scale=1.0, + dynamic_loss_scale=False, + initial_dynamic_scale=2**32, + dynamic_loss_args=None, + fused_adam_legacy=False, + timers=None, + verbose=False + ): + if isinstance(optimizer, (apex.optimizers.FusedAdam, OnebitAdam)): + pass + +def _configure_fp16_optimizer(self, optimizer): + initial_dynamic_scale = self.initial_dynamic_scale() + dynamic_loss_args = self.dynamic_loss_scale_args() + + if isinstance(optimizer, apex.optimizers.FusedAdam) or self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: + defaults['fused_adam_legacy'] = self.optimizer_legacy_fusion() + if self.dynamic_loss_scale(): + defaults.update(dict( + dynamic_loss_scale=True, + initial_dynamic_scale=initial_dynamic_scale, + dynamic_loss_args=dynamic_loss_args, + )) + else: + defaults.update(dict(static_loss_scale=self.loss_scale())) + optimizer = FP16_Optimizer(**defaults) + else: + optimizer = FP16_UnfusedOptimizer( + **defaults, + dynamic_loss_scale=self.dynamic_loss_scale(), + dynamic_loss_args=dynamic_loss_args, + fused_lamb_legacy=isinstance(optimizer, apex.optimizers.FusedLamb) + ) + # raise ValueError(optimizer) + return optimizer \ No newline at end of file diff --git a/allennlp/training/deepspeed/optimizers/zero_optimization.py b/allennlp/training/deepspeed/optimizers/zero_optimization.py new file mode 100644 index 00000000000..8d7dbf8af8e --- /dev/null +++ b/allennlp/training/deepspeed/optimizers/zero_optimization.py @@ -0,0 +1,127 @@ +from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer +from deepspeed.runtime.zero.stage1 import 
FP16_DeepSpeedZeroOptimizer_Stage1 +from deepspeed.runtime.zero.utils import is_zero_supported_optimizer +# from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer +# from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer +# from deepspeed.runtime.config import ( +# DeepSpeedConfig, +# ADAM_OPTIMIZER, +# LAMB_OPTIMIZER, +# ONEBIT_ADAM_OPTIMIZER, +# DEEPSPEED_ADAM, +# DEEPSPEED_OPTIMIZERS +# ) + +from allennlp.common import Registrable, Lazy +from allennlp.training.optimizers import Optimizer + + + +class DummyTimer: + class Timer: + def __init__(self, name): + pass + + def start(self): + pass + + def stop(self): + pass + + def reset(self): + pass + + def __init__(self): + self.timers = {} + + def __call__(self, name): + if name not in self.timers: + self.timers[name] = self.Timer(name) + return self.timers[name] + + def log(self, *args, **kwargs): + pass + + + +class ZeroOptimizer(Registrable): + default_implementation = "stage_2" # "disabled" + +@ZeroOptimizer.register('stage_1', constructor='construct') +class ZeroStage1Optimizer(ZeroOptimizer, FP16_DeepSpeedZeroOptimizer_Stage1): + stage = 1 + + @staticmethod + def construct( + init_optimizer: Optimizer, + dp_process_group=None, + mpu=None, + **kwargs, + ): + return FP16_DeepSpeedZeroOptimizer_Stage1( + init_optimizer, + timers=timers, + dp_process_group=dp_process_group, + mpu=mpu, + **kwargs + ) + + +@ZeroOptimizer.register('stage_2', constructor='construct') +class ZeroStage2Optimizer(ZeroOptimizer, FP16_DeepSpeedZeroOptimizer): + stage = 2 + + @staticmethod + def construct( + init_optimizer: Optimizer, + timers = DummyTimer(), + dp_process_group=None, + mpu=None, + static_loss_scale=1.0, + dynamic_loss_scale=False, + dynamic_loss_args=None, + verbose=False, + contiguous_gradients=True, + reduce_bucket_size=500000000, + allgather_bucket_size=5000000000, + reduce_scatter=True, + overlap_comm=False, + cpu_offload=False, + clip_grad=0.0, + allreduce_always_fp32=False, + postscale_gradients=True, + gradient_predivide_factor=1.0, + gradient_accumulation_steps=1 + ): + return FP16_DeepSpeedZeroOptimizer( + init_optimizer, + timers=timers, + dp_process_group=dp_process_group, + mpu=mpu, + dynamic_loss_scale=dynamic_loss_scale, + dynamic_loss_args=dynamic_loss_args, + verbose=verbose, + contiguous_gradients=contiguous_gradients, + reduce_bucket_size=reduce_bucket_size, + allgather_bucket_size=allgather_bucket_size, + reduce_scatter=reduce_scatter, + overlap_comm=overlap_comm, + cpu_offload=cpu_offload, + clip_grad=clip_grad, + allreduce_always_fp32=allreduce_always_fp32, + postscale_gradients=postscale_gradients, + gradient_predivide_factor=gradient_predivide_factor, + gradient_accumulation_steps=gradient_accumulation_steps + ) + +# @ZeroOptimizer.register('stage_2') +# class ZeroStage2Optimizer(FP16_DeepSpeedZeroOptimizer): +# def __init__(self, init_optimizer=None, timers=DummyTimer(), **kwargs): +# print('!!!!!!!!!!!!!!!') +# print(kwargs) +# assert init_optimizer is not None, init_optimizer +# super().__init__(init_optimizer, timers=timers, **kwargs) + + +# ZeroOptimizer.register('stage_1')(FP16_DeepSpeedZeroOptimizer_Stage1) +# ZeroOptimizer.register('stage_2')(FP16_DeepSpeedZeroOptimizer) \ No newline at end of file diff --git a/allennlp/training/deepspeed_engine_adapter.py b/allennlp/training/deepspeed_engine_adapter.py deleted file mode 100644 index e28b39d3240..00000000000 --- a/allennlp/training/deepspeed_engine_adapter.py +++ /dev/null @@ -1,1130 +0,0 @@ -''' -Copyright 2019 The Microsoft DeepSpeed 
Team -''' - -import os -import torch -import warnings -import torch.distributed as dist - -import apex -from apex import amp -from torch.nn.modules import Module -from torch.distributed.distributed_c10d import _get_global_rank - -from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer -from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 -from deepspeed.runtime.zero.utils import is_zero_supported_optimizer -from deepspeed.runtime.activation_checkpointing import checkpointing as activation_checkpointing -from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer -from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer -from deepspeed.runtime.config import DeepSpeedConfig, \ - ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, DEEPSPEED_ADAM, DEEPSPEED_OPTIMIZERS -from deepspeed.runtime.dataloader import DeepSpeedDataLoader -from deepspeed.runtime.constants import \ - ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ - TORCH_DISTRIBUTED_DEFAULT_PORT -from deepspeed.runtime.zero.constants import \ - ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS -from deepspeed.runtime.csr_tensor import CSRTensor -import deepspeed.runtime.lr_schedules as lr_schedules -from deepspeed.utils import logger, log_dist - -MEMORY_OPT_ALLREDUCE_SIZE = 500000000 -SUMMARY_WRITER_DIR_NAME = "JobId" - -try: - from apex_C import flatten - from apex_C import unflatten -except ImportError: - try: - _ = warned_flatten - except NameError: - logger.warning( - "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten." - ) - warned_flatten = True - from torch._utils import _flatten_dense_tensors as flatten - from torch._utils import _unflatten_dense_tensors as unflatten - - -class DummyTimer: - class Timer: - def __init__(self, name): - pass - - def start(self): - pass - - def stop(self): - pass - - def reset(self): - pass - - def __init__(self): - self.timers = {} - - def __call__(self, name): - if name not in self.timers: - self.timers[name] = self.Timer(name) - return self.timers[name] - - def log(self, *args, **kwargs): - pass - - - -def split_half_float_double_csr(tensors): - dtypes = [ - "torch.cuda.HalfTensor", - "torch.cuda.FloatTensor", - "torch.cuda.DoubleTensor", - CSRTensor.type() - ] - buckets = [] - for i, dtype in enumerate(dtypes): - bucket = [t for t in tensors if t.type() == dtype] - if bucket: - buckets.append((dtype, bucket)) - return buckets - - -def _initialize_parameter_parallel_groups(parameter_parallel_size=None): - data_parallel_size = int(dist.get_world_size()) - if parameter_parallel_size is None: - parameter_parallel_size = int(data_parallel_size) - logger.info("data_parallel_size: %s, parameter_parallel_size: %s", - data_parallel_size, - parameter_parallel_size) - assert data_parallel_size % parameter_parallel_size == 0, \ - 'world size should be divisible by parameter parallel size' - rank = dist.get_rank() - my_group = None - for i in range(dist.get_world_size() // parameter_parallel_size): - ranks = range(i * parameter_parallel_size, (i + 1) * parameter_parallel_size) - group = torch.distributed.new_group(ranks) - if rank in ranks: - my_group = group - return my_group - - -class AllennlpDeepSpeedEngineAdapter(Module): - r"""DeepSpeed engine for training. 
- """ - def __init__(self, - args, - model, - optimizer=None, - model_parameters=None, - training_data=None, - lr_scheduler=None, - mpu=None, - dist_init_required=None, - collate_fn=None, - config_params=None): - super().__init__() - self.client_optimizer = optimizer - self.client_model_parameters = model_parameters - self.client_lr_scheduler = lr_scheduler - self.training_data = training_data - self.collate_fn = collate_fn - self.mpu = mpu - self.data_parallel_group = None - self.global_steps = 0 - self.global_samples = 0 - self.micro_steps = 0 - self.skipped_steps = 0 - self.gradient_average = True - self.warn_unscaled_loss = True - self.config_params = config_params - self.loaded_checkpoint_mp_world_size = None - self.loaded_checkpoint_dp_world_size = None - self.enable_backward_allreduce = True - - if dist_init_required is None: - dist_init_required = not dist.is_initialized() - - self.dist_backend = "nccl" - if dist_init_required: - if not dist.is_initialized(): - logger.info("Initializing torch distributed with backend: {}".format( - self.dist_backend)) - dist.init_process_group(backend=self.dist_backend) - else: - logger.warning( - "Was given dist_init_required=True but detected that torch" - "distributed was already initialized, cannot initialize twice.") - - self._configure_with_arguments(args, mpu) - - self._init_distributed(dist_init_required) - - # Configure distributed model - self._configure_distributed_model(model) - - if training_data: - self.training_dataloader = self.deepspeed_io(training_data) - else: - self.training_dataloader = None - - # Configure optimizer and scheduler - self.optimizer = None - self.lr_scheduler = None - if model_parameters or optimizer: - self._configure_optimizer(optimizer, model_parameters) - self._configure_lr_scheduler(lr_scheduler) - self._report_progress(0) - - # Bookkeeping for csr support - self.csr_tensor_module_names = set() - if self.sparse_gradients_enabled(): - for name, module in self.module.named_modules(): - if isinstance(module, torch.nn.Embedding): - self.csr_tensor_module_names.add(name + ".weight") - logger.info("Will convert {} to sparse (csr) " - "tensor during training".format(name)) - - self.save_non_zero_checkpoint = False - self.save_zero_checkpoint = False - self._configure_checkpointing(dist_init_required) - - if self.global_rank == 0: - self._config.print('DeepSpeedLight configuration') - - def wall_clock_breakdown(self): - return self._config.wall_clock_breakdown - - def memory_breakdown(self): - return self._config.memory_breakdown - - def sparse_gradients_enabled(self): - return self._config.sparse_gradients_enabled - - def train_batch_size(self): - return self._config.train_batch_size - - def train_micro_batch_size_per_gpu(self): - return self._config.train_micro_batch_size_per_gpu - - def optimizer_name(self): - return self.client_optimizer.__class__.__name__ if self.client_optimizer else self._config.optimizer_name - - def optimizer_params(self): - return self._config.optimizer_params - - def optimizer_legacy_fusion(self): - return self._config.optimizer_legacy_fusion - - def scheduler_name(self): - return self._config.scheduler_name - - def scheduler_params(self): - return self._config.scheduler_params - - def zero_optimization(self): - return self._config.zero_enabled - - def zero_allow_untested_optimizer(self): - return self._config.zero_allow_untested_optimizer - - def zero_reduce_scatter(self): - return self._config.zero_config.reduce_scatter - - def zero_overlap_comm(self): - return 
self._config.zero_config.overlap_comm - - def zero_cpu_offload(self): - return self._config.zero_config.cpu_offload - - def zero_optimization_stage(self): - return self._config.zero_optimization_stage - - def zero_reduce_bucket_size(self): - return self._config.zero_config.reduce_bucket_size - - def zero_allgather_bucket_size(self): - return self._config.zero_config.allgather_bucket_size - - def zero_optimization_partition_gradients(self): - return self.zero_optimization_stage() >= ZERO_OPTIMIZATION_GRADIENTS - - def zero_contiguous_gradients(self): - return self._config.zero_config.contiguous_gradients - - def zero_load_from_fp32_weights(self): - return self._config.zero_config.load_from_fp32_weights - - def fp16_enabled(self): - return self._config.fp16_enabled - - def amp_enabled(self): - return self._config.amp_enabled - - def amp_params(self): - return self._config.amp_params - - def loss_scale(self): - return self._config.loss_scale - - def gradient_accumulation_steps(self): - return self._config.gradient_accumulation_steps - - def allreduce_always_fp32(self): - return self._config.allreduce_always_fp32 - - def postscale_gradients(self): - return not self._config.prescale_gradients - - def gradient_predivide_factor(self): - return self._config.gradient_predivide_factor - - def steps_per_print(self): - return self._config.steps_per_print - - def zero_allgather_partitions(self): - return self._config.zero_config.allgather_partitions - - def dump_state(self): - return self._config.dump_state - - def gradient_clipping(self): - return self._config.gradient_clipping - - def dynamic_loss_scale(self): - return self._config.loss_scale == 0 - - def initial_dynamic_scale(self): - return self._config.initial_dynamic_scale - - def dynamic_loss_scale_args(self): - return self._config.dynamic_loss_scale_args - - def _configure_lr_scheduler(self, client_lr_scheduler): - # First check for scheduler in json configuration - lr_scheduler = self._scheduler_from_config(self.optimizer) - if lr_scheduler: - if self.global_rank == 0: - logger.info( - f'DeepSpeed using configured LR scheduler = {self.scheduler_name()}') - self.lr_scheduler = lr_scheduler - else: - if self.global_rank == 0: - logger.info('DeepSpeed using client LR scheduler') - self.lr_scheduler = client_lr_scheduler - log_dist(f'DeepSpeed LR Scheduler = {self.lr_scheduler}', ranks=[0]) - - def _configure_checkpointing(self, dist_init_required): - - dp_rank = self.global_rank - if self.mpu: - dp_rank = self.mpu.get_data_parallel_rank() - - # only the first data parallel process needs to store the model checkpoint - self.save_non_zero_checkpoint = (dp_rank == 0) - - if self.zero_optimization(): - param_rank = torch.distributed.get_rank( - group=self.optimizer.dp_process_group) - - # Only the first parameter parallel process needs to store the - # optimizer state checkpoints for zero - self.save_zero_checkpoint = (param_rank == dp_rank) - - def _scheduler_from_config(self, optimizer): - scheduler_name = self.scheduler_name() - if scheduler_name is not None: - if hasattr(lr_schedules, scheduler_name): - scheduler = getattr(lr_schedules, scheduler_name) - else: - assert hasattr(torch.optim.lr_scheduler, scheduler_name), \ - f"DeepSpeed does not recognize LR scheduler {scheduler_name}" - - scheduler = getattr(torch.optim.lr_scheduler, scheduler_name) - - scheduler_params = self.scheduler_params() - instantiated_scheduler = scheduler(optimizer, **scheduler_params) - return instantiated_scheduler - else: - return None - - def _init_distributed(self, 
dist_init_required): - if self.local_rank >= 0: - torch.cuda.set_device(self.local_rank) - self.device = torch.device("cuda", self.local_rank) - self.world_size = dist.get_world_size() - self.global_rank = dist.get_rank() - else: - self.world_size = 1 - self.global_rank = 0 - self.device = torch.device("cuda") - - # Configure based on command line arguments - def _configure_with_arguments(self, args, mpu): - self.local_rank = args.local_rank if hasattr(args, 'local_rank') else 0 - self._config = DeepSpeedConfig(args.deepspeed_config, - mpu, - param_dict=self.config_params) - - - def _is_supported_optimizer(self, optimizer_name): - return optimizer_name in DEEPSPEED_OPTIMIZERS or \ - getattr(torch.optim, optimizer_name, None) is not None - - def _broadcast_model(self): - for p in self.module.parameters(): - if torch.is_tensor(p): - dist.broadcast(p, - self.broadcast_src_rank, - group=self.data_parallel_group) - - def _configure_distributed_model(self, model): - self.module = model - if self.fp16_enabled(): - self.module.half() - self.module.to(self.device) - - if self.mpu is None: - self.data_parallel_group = _initialize_parameter_parallel_groups() - self.dp_world_size = dist.get_world_size() - self.mp_world_size = 1 - self.broadcast_src_rank = 0 - else: - self.data_parallel_group = self.mpu.get_data_parallel_group() - self.dp_world_size = self.mpu.get_data_parallel_world_size() - self.mp_world_size = self.mpu.get_model_parallel_world_size() - self.broadcast_src_rank = _get_global_rank( - self.mpu.get_data_parallel_group(), - 0) - - if not self.amp_enabled(): - self._broadcast_model() - - # Configure optimizer - def _configure_optimizer(self, client_optimizer, model_parameters): - - if client_optimizer is not None: - basic_optimizer = client_optimizer - if self.global_rank == 0: - logger.info('Using client Optimizer as basic optimizer') - else: - basic_optimizer = self._configure_basic_optimizer(model_parameters) - if self.global_rank == 0: - logger.info( - 'Using DeepSpeed Optimizer param name {} as basic optimizer'.format( - self.optimizer_name())) - - if self.global_rank == 0: - logger.info('DeepSpeed Basic Optimizer = {}'.format(basic_optimizer)) - - if self.zero_optimization(): - assert not self.amp_enabled(), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similar to amp opt_mode=O2" - if not is_zero_supported_optimizer(basic_optimizer): - assert self.zero_allow_untested_optimizer(), \ - 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' 
- - if self.global_rank == 0: - logger.warning( - "**** You are using ZeRO with an untested optimizer, proceed with caution *****" - ) - self.optimizer = self._configure_zero_optimizer(basic_optimizer) - elif self.amp_enabled(): - assert not self.fp16_enabled(), "Cannot enable both amp with (legacy) fp16 mode" - amp_params = self.amp_params() - if self.global_rank == 0: - logger.info(f"Initializing AMP with these params: {amp_params}") - self.module, self.optimizer = amp.initialize(self.module, basic_optimizer, **amp_params) - self._broadcast_model() - elif self.fp16_enabled(): - self.optimizer = self._configure_fp16_optimizer(basic_optimizer) - else: - self.optimizer = basic_optimizer - logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer)) - logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer.state_dict())) - - def _configure_basic_optimizer(self, model_parameters): - optimizer_parameters = self.optimizer_params() - # print(optimizer_parameters.keys()) - if 'max_grad_norm' in optimizer_parameters.keys(): - raise ValueError( - "'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details" - ) - if self.optimizer_name() == ADAM_OPTIMIZER: - if self.zero_cpu_offload(): - optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters) - else: - from apex.optimizers.fused_adam import FusedAdam - optimizer = FusedAdam(model_parameters, **optimizer_parameters) - elif self.optimizer_name() == DEEPSPEED_ADAM: - from deepspeed.ops.adam import DeepSpeedCPUAdam - optimizer = DeepSpeedCPUAdam(model_parameters, **optimizer_parameters) - elif self.optimizer_name() == LAMB_OPTIMIZER: - from deepspeed.ops.lamb import FusedLamb - optimizer = FusedLamb(model_parameters, **optimizer_parameters) - elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: - from deepspeed.runtime.fp16.onebit_adam import OnebitAdam - optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) - else: - torch_optimizer = getattr(torch.optim, self.optimizer_name()) - optimizer = torch_optimizer(model_parameters, **optimizer_parameters) - return optimizer - - def _configure_fp16_optimizer(self, optimizer): - initial_dynamic_scale = self.initial_dynamic_scale() - dynamic_loss_args = self.dynamic_loss_scale_args() - clip_grad = self.gradient_clipping() - if isinstance(optimizer, - apex.optimizers.FusedAdam) or self.optimizer_name( - ) == ONEBIT_ADAM_OPTIMIZER: - if self.dynamic_loss_scale(): - logger.info('Creating fp16 optimizer with dynamic loss scale') - optimizer = FP16_Optimizer( - optimizer, - dynamic_loss_scale=True, - initial_dynamic_scale=initial_dynamic_scale, - dynamic_loss_args=dynamic_loss_args, - mpu=self.mpu, - clip_grad=clip_grad, - fused_adam_legacy=self.optimizer_legacy_fusion(), - timers=None) - else: - logger.info('Creating fp16 optimizer with static loss scale: {}'.format( - self.loss_scale())) - optimizer = FP16_Optimizer( - optimizer, - static_loss_scale=self.loss_scale(), - mpu=self.mpu, - clip_grad=clip_grad, - fused_adam_legacy=self.optimizer_legacy_fusion()) - else: - logger.info('Creating fp16 unfused optimizer with dynamic loss scale') - optimizer = FP16_UnfusedOptimizer( - optimizer, - dynamic_loss_scale=self.dynamic_loss_scale(), - dynamic_loss_args=dynamic_loss_args, - mpu=self.mpu, - clip_grad=clip_grad, - fused_lamb_legacy=self.optimizer_name() == LAMB_OPTIMIZER) - - return optimizer - - def 
_configure_zero_optimizer(self, optimizer): - zero_stage = self.zero_optimization_stage() - logger.info('Creating fp16 ZeRO stage {} optimizer'.format(zero_stage)) - - if zero_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES: - assert self.zero_reduce_scatter(), 'Stage 1 only supports reduce scatter mode' - optimizer = FP16_DeepSpeedZeroOptimizer_Stage1( - optimizer, - static_loss_scale=self.loss_scale(), - dynamic_loss_scale=self.dynamic_loss_scale(), - dynamic_loss_args=self.dynamic_loss_scale_args(), - clip_grad=self.gradient_clipping(), - all_gather_partitions=self.zero_allgather_partitions(), - allgather_size=self.zero_allgather_bucket_size(), - max_elements_per_comm=self.zero_reduce_bucket_size(), - dp_process_group=self.data_parallel_group, - mpu=self.mpu) - elif zero_stage == ZERO_OPTIMIZATION_GRADIENTS: - optimizer = FP16_DeepSpeedZeroOptimizer( - optimizer, - timers=DummyTimer(), # None, - static_loss_scale=self.loss_scale(), - dynamic_loss_scale=self.dynamic_loss_scale(), - dynamic_loss_args=self.dynamic_loss_scale_args(), - clip_grad=self.gradient_clipping(), - contiguous_gradients=self.zero_contiguous_gradients(), - reduce_bucket_size=self.zero_reduce_bucket_size(), - allgather_bucket_size=self.zero_allgather_bucket_size(), - dp_process_group=self.data_parallel_group, - reduce_scatter=self.zero_reduce_scatter(), - overlap_comm=self.zero_overlap_comm(), - cpu_offload=self.zero_cpu_offload(), - mpu=self.mpu, - postscale_gradients=self.postscale_gradients(), - gradient_predivide_factor=self.gradient_predivide_factor(), - gradient_accumulation_steps=self.gradient_accumulation_steps(), - verbose=False - ) - else: - raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) - - return optimizer - - def deepspeed_io(self, - dataset, - batch_size=None, - route=ROUTE_TRAIN, - pin_memory=True, - data_sampler=None, - collate_fn=None, - num_local_io_workers=None): - if not isinstance(dataset, torch.utils.data.Dataset): - raise ValueError("Training data must be a torch Dataset") - - if data_sampler is None and (route == ROUTE_PREDICT or route == ROUTE_EVAL): - data_sampler = torch.utils.data.SequentialSampler(dataset) - - if batch_size is None: - batch_size = self.train_micro_batch_size_per_gpu() - - if collate_fn is None: - collate_fn = self.collate_fn - - # If mpu is provied, forward world size and parallel rank to sampler. 
- data_parallel_world_size = None - data_parallel_rank = None - if self.mpu is not None: - data_parallel_world_size = self.mpu.get_data_parallel_world_size() - data_parallel_rank = self.mpu.get_data_parallel_rank() - - return DeepSpeedDataLoader(dataset=dataset, - batch_size=batch_size, - pin_memory=pin_memory, - collate_fn=collate_fn, - local_rank=self.local_rank, - tput_timer=None, - num_local_io_workers=num_local_io_workers, - data_sampler=data_sampler, - data_parallel_world_size=data_parallel_world_size, - data_parallel_rank=data_parallel_rank) - - def train(self): - r""" - """ - - self.warn_unscaled_loss = True - self.module.train() - - def eval(self): - r""" - """ - - self.warn_unscaled_loss = True - self.module.train(False) - - def _scale_loss(self, prescaled_loss): - if isinstance(prescaled_loss, torch.Tensor): - scaled_loss = prescaled_loss / self.gradient_accumulation_steps() - elif isinstance(prescaled_loss, tuple) or isinstance(prescaled_loss, list): - scaled_loss = [] - for l in prescaled_loss: - if isinstance(l, torch.Tensor): - scaled_loss.append(l / self.gradient_accumulation_steps()) - else: - scaled_loss.append(l) - else: - scaled_loss = prescaled_loss - if self.warn_unscaled_loss: - logger.warning( - f'DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}' - ) - self.warn_unscaled_loss = False - - return scaled_loss - - def forward(self, *inputs, **kwargs): - r"""Execute forward propagation - - Arguments: - *inputs: Variable length input list - **kwargs: variable length keyword arguments - """ - loss = self.module(*inputs, **kwargs) - return loss - - def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): - #Zero stage 2 communicates during non gradient accumulation boundaries as well - if self.zero_optimization_partition_gradients(): - self.optimizer.overlapping_partition_gradients_reduce_epilogue() - - #Communicate only at gradient accumulation boundaries - elif self.is_gradient_accumulation_boundary(): - if self.zero_optimization_stage() == ZERO_OPTIMIZATION_OPTIMIZER_STATES: - assert self.zero_reduce_scatter() - self.optimizer.reduce_scatter_gradients( - postscale_gradients=self.postscale_gradients(), - gradient_predivide_factor=self.gradient_predivide_factor(), - gradient_average=self.gradient_average) - else: - self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) - - def backward(self, loss, allreduce_gradients=True, release_loss=False): - r"""Execute backward pass on the loss - - Arguments: - loss: Torch tensor on which to execute backward propagation - allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True. - """ - - # scale loss w.r.t. 
gradient accumulation if needed - if self.gradient_accumulation_steps() > 1: - loss = self._scale_loss(loss.float()) - - assert self.optimizer is not None, "must provide optimizer during " \ - "init in order to use backward" - - if self.zero_optimization(): - self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( - ) - self.optimizer.backward(loss) - elif self.amp_enabled(): - # AMP requires delaying unscale when inside gradient accumulation boundaries - # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations - delay_unscale = not self.is_gradient_accumulation_boundary() - with amp.scale_loss(loss, - self.optimizer, - delay_unscale=delay_unscale) as scaled_loss: - scaled_loss.backward() - elif self.fp16_enabled(): - self.optimizer.backward(loss) - else: - loss.backward() - - if allreduce_gradients and self.enable_backward_allreduce: - self.allreduce_gradients() - - if release_loss: - # loss.data = None - pass - - return loss - - def is_gradient_accumulation_boundary(self): - """Query whether the current micro-batch is at the boundary of - gradient accumulation, and thus will trigger gradient reductions and - an optimizer step. - - Returns: - bool: if the current step is a gradient accumulation boundary. - """ - return (self.micro_steps + 1) % \ - self.gradient_accumulation_steps() == 0 - - def zero_grad(self): - """ - Zero parameter grads. - """ - for param_name, param in self.module.named_parameters(): - param.grad = None - - def clip_fp32_gradients(self): - torch.nn.utils.clip_grad_norm_(parameters=self.module.parameters(), - max_norm=self.gradient_clipping()) - - def _take_model_step(self): - if self.gradient_clipping() > 0.0: - if not self.fp16_enabled() and not self.amp_enabled(): - self.clip_fp32_gradients() - elif self.amp_enabled(): - # AMP's recommended way of doing clipping - # https://nvidia.github.io/apex/advanced.html#gradient-clipping - master_params = amp.master_params(self.optimizer) - torch.nn.utils.clip_grad_norm_(parameters=master_params, - max_norm=self.gradient_clipping()) - self.optimizer.step() - - #zero grad in basic optimizer could be unreliable and may not exhibit - #the behaviour that we want - if not self.zero_optimization() and not self.fp16_enabled( - ) and not self.amp_enabled(): - self.zero_grad() - else: - self.optimizer.zero_grad() - - report_progress = self.global_rank == 0 if self.global_rank else True - - # Check overlow here since in DS fp16 optimizer, the overflow is updated in above step() function. - overflow = False - if hasattr(self.optimizer, 'overflow'): - overflow = self.optimizer.overflow - - if overflow: - self.skipped_steps += 1 - else: - if self.lr_scheduler is not None: - self.lr_scheduler.step() - if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: - self._report_progress(self.global_steps + 1) - - self.global_steps += 1 - self.global_samples += self.train_batch_size() - - def step(self): - r"""Execute the weight update step after forward and backward propagation - on effective_train_batch. 
- """ - - assert self.optimizer is not None, "must provide optimizer during " \ - "init in order to use step" - report_progress = self.global_rank == 0 if self.global_rank else True - - # Update the model when we reach gradient accumulation boundaries - if self.is_gradient_accumulation_boundary(): - self._take_model_step() - - self.micro_steps += 1 - - def _get_optimizer_param(self, param_name): - result = [] - if not self.optimizer: - return result - for group in self.optimizer.param_groups: - if param_name in group: - result.append(group[param_name]) - else: - result.append(0.0) - return result - - def get_lr(self): - return self._get_optimizer_param('lr') - - def get_type(self): - return self._get_optimizer_param('type') - - def get_mom(self): - return self._get_optimizer_param('betas') - - def _report_progress(self, step): - lr = self.get_lr() - mom = self.get_mom() - log_dist(f'step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}', - ranks=[0]) - - def allreduce_bucket(self, bucket): - tensor = flatten(bucket) - - tensor_to_allreduce = tensor - - if self.allreduce_always_fp32(): - tensor_to_allreduce = tensor.float() - - if self.postscale_gradients(): - if self.gradient_predivide_factor() != 1.0: - tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor()) - - dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group) - - if self.gradient_average: - if self.gradient_predivide_factor() != self.dp_world_size: - tensor_to_allreduce.mul_(self.gradient_predivide_factor() / - self.dp_world_size) - else: - tensor_to_allreduce.div_(self.dp_world_size) - dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group) - - if self.allreduce_always_fp32() and tensor is not tensor_to_allreduce: - tensor.copy_(tensor_to_allreduce) - - return tensor - - def allreduce_and_copy(self, small_bucket): - allreduced = self.allreduce_bucket(small_bucket) - for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): - buf.copy_(synced) - - def allreduce_no_retain(self, bucket, numel_per_bucket=500000000): - small_bucket = [] - numel = 0 - for tensor in bucket: - small_bucket.append(tensor) - numel = numel + tensor.numel() - if numel > numel_per_bucket: - self.allreduce_and_copy(small_bucket) - small_bucket = [] - numel = 0 - if len(small_bucket) > 0: - self.allreduce_and_copy(small_bucket) - - def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000): - grads = [] - for param_name, param in self.module.named_parameters(): - if param.grad is None: - # In cases where there is an imbalance of empty grads across - # ranks we must create empty grads, this will ensure that every - # rank is reducing the same size. In some cases it may make - # sense in the future to support the ability to average not - # w.r.t. world size but with a different value. 
- param.grad = torch.zeros(param.size(), - dtype=param.dtype, - device=param.device) - grads.append(param.grad.data) - else: - grad_data = param.grad.data - if self.sparse_gradients_enabled( - ) and param_name in self.csr_tensor_module_names: - grads.append(CSRTensor(grad_data)) - else: - grads.append(grad_data) - - split_buckets = split_half_float_double_csr(grads) - - for i, bucket_tuple in enumerate(split_buckets): - bucket_type, bucket = bucket_tuple - if bucket_type == CSRTensor.type(): - self.csr_allreduce_no_retain(bucket) - else: - self.allreduce_no_retain(bucket, numel_per_bucket=elements_per_buffer) - - def csr_allreduce_no_retain(self, bucket): - allreduced_csrs = self.csr_allreduce_bucket(bucket) - # Densify csr tensor and copy back to original location - for csr in allreduced_csrs: - dense_tensor = csr.to_dense() - csr.orig_dense_tensor.copy_(dense_tensor) - - def csr_allreduce_bucket(self, bucket): - csr_list = [] - for csr in bucket: - csr_list.append(self.csr_allreduce(csr)) - return csr_list - - def csr_allreduce(self, csr): - # Pre-divide for fp16 stability - csr.values.div_(self.dp_world_size) - - indices_device_list = self.csr_all_gather(csr.indices) - values_device_list = self.csr_all_gather(csr.values) - - csr.indices = torch.cat(indices_device_list) - csr.values = torch.cat(values_device_list) - return csr - - def csr_all_gather(self, value): - my_size = torch.LongTensor([value.size()[0]]).to(self.device) - all_sizes = self.all_gather_scalar(my_size) - max_size = torch.cat(all_sizes).max() - fill_size = (max_size - my_size) - - assert value.dim() in [1, 2] - if value.dim() == 1: - if fill_size > 0: - value = torch.cat([value, value.new_zeros(fill_size)]) - tensor_list = [value.new_zeros(max_size) for _ in range(self.dp_world_size)] - else: - if fill_size > 0: - value = torch.cat([value, value.new_zeros(fill_size, value.size()[1])]) - tensor_list = [ - value.new_zeros(max_size, - value.size()[1]) for _ in range(self.dp_world_size) - ] - - dist.all_gather(tensor_list, value, group=self.data_parallel_group) - tensors = [] - for dev_idx, t in enumerate(tensor_list): - size = all_sizes[dev_idx][0] - tensors.append( - t.index_select(0, - torch.LongTensor(range(size)).to(self.device))) - - return tensors - - def all_gather_scalar(self, value): - tensor_list = [value.new_zeros(value.size()) for _ in range(self.dp_world_size)] - dist.all_gather(tensor_list, value, group=self.data_parallel_group) - return tensor_list - - def module_state_dict(self, destination=None, prefix='', keep_vars=False): - sd = self.module.state_dict(destination, prefix, keep_vars) - return sd - - def load_module_state_dict(self, state_dict, strict=True): - self.module.load_state_dict(state_dict, strict=strict) - - def _get_rank_zero_ckpt_name(self, checkpoints_path, tag, mp_rank, dp_rank): - filename = 'zero_pp_rank_{}'.format(dp_rank) - zero_ckpt_name = os.path.join( - checkpoints_path, - str(tag), - filename + '_mp_rank_{:02d}'.format(mp_rank) + 'optim_states.pt') - return zero_ckpt_name - - def _get_zero_ckpt_name(self, checkpoints_path, tag): - mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - pp_rank = torch.distributed.get_rank(group=self.optimizer.dp_process_group) - return self._get_rank_zero_ckpt_name(checkpoints_path, tag, mp_rank, pp_rank) - - def _get_ckpt_name(self, checkpoints_path, tag): - mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - ckpt_name = os.path.join(checkpoints_path, - str(tag), - 'mp_rank_{:02d}'.format(mp_rank) + 
'_model_states.pt') - return ckpt_name - - def load_checkpoint(self, - load_dir, - tag, - load_module_strict=True, - load_optimizer_states=True, - load_lr_scheduler_states=True): - r"""Load training checkpoint - - Arguments: - load_dir: Required. Directory to load the checkpoint from - tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step. - load_module_strict: Optional. Boolean to strictly enforce that the keys in state_dict of module and checkpoint match. - load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint. Ex. ADAM's momentum and variance - load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint. - Return: - load_path: Path of the loaded checkpoint. None if loading the checkpoint failed - client_state: State dictionary used for loading required training states in the client code. - """ - - load_path, client_states = self._load_checkpoint(load_dir, - tag, - load_module_strict=load_module_strict, - load_optimizer_states=load_optimizer_states, - load_lr_scheduler_states=load_lr_scheduler_states) - - if self.zero_optimization() and load_path is not None: - self._load_zero_checkpoint(load_dir, - tag, - load_optimizer_states=load_optimizer_states) - - return load_path, client_states - - def _load_checkpoint(self, - load_dir, - tag, - load_module_strict=True, - load_optimizer_states=True, - load_lr_scheduler_states=True): - - load_path = self._get_ckpt_name(load_dir, tag) - - if not os.path.exists(load_path): - logger.warn( - 'Client provided checkpoint load path: {} does not exist ... skip checkpoint load' - .format(load_path)) - return None, None - - logger.info(f'rank: {self.global_rank} loading checkpoint: {load_path}') - checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage) - - self.load_module_state_dict(state_dict=checkpoint['module'], - strict=load_module_strict) - if not self.zero_optimization(): - if self.fp16_enabled(): - self.optimizer.load_state_dict( - checkpoint['optimizer'], - load_optimizer_states=load_optimizer_states) - elif load_optimizer_states: - self.optimizer.load_state_dict(checkpoint['optimizer']) - - if load_lr_scheduler_states and self.lr_scheduler is not None: - self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) - - self.csr_tensor_module_names = checkpoint['csr_tensor_module_names'] - self.global_steps = checkpoint['global_steps'] - self.global_samples = checkpoint.get('global_samples', - self.global_steps * self.train_batch_size()) - self.skipped_steps = checkpoint['skipped_steps'] - self.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size'] - self.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size'] - deepspeed_states = [ - 'module', - 'optimizer', - 'lr_scheduler', - 'csr_tensor_module_names', - 'skipped_steps', - 'global_steps', - 'dp_world_size', - 'mp_world_size' - ] - client_state = { - key: value - for key, - value in checkpoint.items() if not key in deepspeed_states - } - - return load_path, client_state - - def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True): - zero_sd_list = self._get_all_zero_checkpoints(load_dir, tag) - if zero_sd_list is None: - return - - self.optimizer.load_state_dict( - state_dict_list=zero_sd_list, - load_optimizer_states=load_optimizer_states, - load_from_fp32_weights=self.zero_load_from_fp32_weights()) - print( - f'loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}' - ) - - def 
_get_mp_rank_zero_checkpoint_names(self, load_dir, tag, mp_rank, dp_world_size): - zero_ckpt_names = [] - for dp_rank in range(dp_world_size): - ckpt_name = self._get_rank_zero_ckpt_name(checkpoints_path=load_dir, - tag=tag, - mp_rank=mp_rank, - dp_rank=dp_rank) - zero_ckpt_names.append(ckpt_name) - - return zero_ckpt_names - - def _get_all_zero_checkpoint_names(self, - load_dir, - tag, - mp_world_size, - dp_world_size): - zero_ckpt_names = [] - for mp_rank in range(mp_world_size): - mp_rank_ckpt_names = self._get_mp_rank_zero_checkpoint_names( - load_dir=load_dir, - tag=tag, - mp_rank=mp_rank, - dp_world_size=dp_world_size) - zero_ckpt_names += mp_rank_ckpt_names - - return zero_ckpt_names - - def _get_all_zero_checkpoints(self, load_dir, tag): - mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - zero_ckpt_names = self._get_mp_rank_zero_checkpoint_names( - load_dir=load_dir, - tag=tag, - mp_rank=mp_rank, - dp_world_size=self.loaded_checkpoint_dp_world_size) - invalid_zero_ckpt_paths = [] - for ckpt_name in zero_ckpt_names: - if not os.path.exists(ckpt_name): - invalid_zero_ckpt_paths.append(ckpt_name) - - if len(invalid_zero_ckpt_paths) > 0: - logger.warn( - f"Client provided zero checkpoint load paths: {invalid_zero_ckpt_paths} does not exist" - ) - return None - - zero_sd_list = [] - for ckpt_name in zero_ckpt_names: - zero_sd_list.append(torch.load(ckpt_name, map_location='cpu')) - - zero_optimizer_sd = [sd['optimizer_state_dict'] for sd in zero_sd_list] - print( - f"successfully loaded {len(zero_optimizer_sd)} ZeRO state_dicts for rank {self.global_rank}" - ) - return zero_optimizer_sd - - def save_checkpoint(self, save_dir, tag, client_state={}): - r"""Save training checkpoint - - Arguments: - save_dir: Required. Directory for saving the checkpoint - tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step. - client_state: Optional. State dictionary used for saving required training states in the client code. 
- """ - - # This is to make sure the checkpoint names are created without collision - # There seems to be issue creating them in parallel - - if self.save_non_zero_checkpoint: - self._create_checkpoint_file(save_dir, tag, False) - self._save_checkpoint(save_dir, tag, client_state=client_state) - - if self.save_zero_checkpoint: - self._create_zero_checkpoint_files(save_dir, tag) - self._save_zero_checkpoint(save_dir, tag) - - return True \ No newline at end of file From d0e8a680a787f09237a3209c071848bc43bc552e Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Wed, 28 Oct 2020 13:01:00 -0400 Subject: [PATCH 06/20] debugging memory leak --- .../training/deepspeed/deepspeed_trainer.py | 204 ++++++++---------- allennlp/training/deepspeed/engine_adapter.py | 12 +- .../training/deepspeed/optimizers/basic.py | 7 - 3 files changed, 96 insertions(+), 127 deletions(-) diff --git a/allennlp/training/deepspeed/deepspeed_trainer.py b/allennlp/training/deepspeed/deepspeed_trainer.py index 33e70eeb1b5..a43f408980c 100644 --- a/allennlp/training/deepspeed/deepspeed_trainer.py +++ b/allennlp/training/deepspeed/deepspeed_trainer.py @@ -1,8 +1,3 @@ -import logging -from deepspeed.utils import logger as ds_logger -ds_logger.setLevel(logging.WARNING) -ds_logger.propagate = False - import datetime import logging import os @@ -10,19 +5,14 @@ import math import time import traceback +from argparse import Namespace from copy import deepcopy from contextlib import contextmanager from typing import Any, Dict, Iterator, List, Optional, Tuple, Union - -from allennlp.common.util import int_to_device +from dataclasses import dataclass, asdict import torch import torch.distributed as dist -from torch.cuda import amp -from torch.nn.parallel import DistributedDataParallel - -import deepspeed -# from deepspeed.runtime.engine import DeepSpeedEngine from allennlp.common import Lazy, Registrable, Tqdm, Params, FromParams from allennlp.common import util as common_util @@ -39,61 +29,27 @@ from allennlp.training.tensorboard_writer import TensorboardWriter from allennlp.training.trainer import Trainer, BatchCallback, EpochCallback -from allennlp.training.deepspeed.engine_adapter import AllennlpDeepSpeedEngineAdapter as DeepSpeedEngine +from allennlp.training.deepspeed.engine_adapter import AllennlpDeepSpeedEngineAdapter from allennlp.training.deepspeed.optimizers.zero_optimization import ZeroOptimizer +from pytorch_memlab import LineProfiler + logger = logging.getLogger(__name__) -JsonDict = Dict[str, Any] +@dataclass +class DeepspeedFP16Config(FromParams): + enabled: bool = False + loss_scale: float = 0. + initial_scale_power: int = 32 + loss_scale_window: int = 1000 + hysteresis: int = 2 + min_loss_scale: float = 1. 
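A minimal sketch of how these config dataclasses are meant to be consumed (the key names and values below are illustrative, assuming the standard DeepSpeed JSON keys such as `fp16`/`amp` and a hypothetical batch size): they flatten via `dataclasses.asdict` into the plain `config_params` dict that is handed to the engine adapter further down in this diff.

    from dataclasses import asdict

    fp16 = DeepspeedFP16Config(enabled=True)   # dataclass defined just above
    amp = DeepspeedAMPConfig()                 # dataclass defined just below
    config_params = dict(
        fp16=asdict(fp16),
        amp=asdict(amp),
        train_batch_size=8,                    # hypothetical value
        gradient_accumulation_steps=1,
        zero_allow_untested_optimizer=True,
        wall_clock_breakdown=False,
    )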
+ +@dataclass +class DeepspeedAMPConfig(FromParams): + enabled: bool = False + opt_level: str = "O1" -class DeepspeedConfig(FromParams): - def __init__( - self, - optimizer: Lazy[Optimizer], # JsonDict, - fp16: JsonDict = {'enabled': False}, - amp: JsonDict = {'enabled': False}, - zero_optimization: Union[bool, Dict] = False, - zero_optimizer: Lazy[ZeroOptimizer] = None, - zero_allow_untested_optimizer: bool = True, - wall_clock_breakdown: bool = False - ): - self.optimizer = optimizer - self.fp16 = fp16 - self.amp = amp - self.zero_optimization = zero_optimization - self.zero_allow_untested_optimizer = zero_allow_untested_optimizer - self.wall_clock_breakdown = wall_clock_breakdown - self._zero_optim = zero_optimizer - - def launch( - self, - model: torch.nn.Module, - local_rank: int, - batch_size: int, - gradient_accumulation_steps: int, - **kwargs - ): - from argparse import Namespace - args = Namespace(deepspeed_config=None, deepspeed=True, local_rank=local_rank) - - optimizer = self.optimizer.construct(model_parameters=model.parameters()) - del self.optimizer - - zero_optim = self._zero_optim - del self._zero_optim - # del self.zero_optimization - - config = dict(**vars(self), train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) - ds = DeepSpeedEngine( - args=args, - model=model, - optimizer=optimizer, - zero_optimizer=zero_optim, - model_parameters=model.parameters(), - dist_init_required=False, - config_params=config - ) - return ds, ds.optimizer @Trainer.register("deepspeed", constructor="from_partial_objects") class DeepspeedTrainer(Trainer): @@ -101,7 +57,9 @@ def __init__( self, model: Model, data_loader: DataLoader, - deepspeed_config: DeepspeedConfig, + # deepspeed_config: DeepspeedConfig, + deepspeed_engine: AllennlpDeepSpeedEngineAdapter, + deepspeed_optimizer: ZeroOptimizer, patience: Optional[int] = None, validation_metric: str = "-loss", validation_data_loader: DataLoader = None, @@ -116,8 +74,7 @@ def __init__( distributed: bool = False, local_rank: int = 0, world_size: int = 1, - num_gradient_accumulation_steps: int = 1, - use_amp: bool = False, + num_gradient_accumulation_steps: int = 1 ) -> None: super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size) @@ -125,6 +82,14 @@ def __init__( # not already on the GPU then the optimizer is going to be wrong. self.model = model + self.model_engine = deepspeed_engine + self.optimizer = deepspeed_optimizer + + if hasattr(self.model_engine, 'timers'): + def mute_log(*args, **kwargs): + pass + self.model_engine.timers.log = mute_log + self.data_loader = data_loader self._validation_data_loader = validation_data_loader @@ -170,33 +135,18 @@ def __init__( self._num_gradient_accumulation_steps = num_gradient_accumulation_steps self._pytorch_model = self.model - - self._ds_config = deepspeed_config - self.model_engine, self.optimizer, *_ = self._ds_config.launch( - self.model, - local_rank, - self.data_loader.batch_size, - num_gradient_accumulation_steps - ) - - if hasattr(self.model_engine, 'timers'): - def mute_log(*args, **kwargs): - pass - self.model_engine.timers.log = mute_log def batch_outputs(self, batch: TensorDict, for_training: bool) -> Dict[str, torch.Tensor]: """ Does a forward pass on the given batch and returns the output dictionary that the model returns, after adding any specified regularization penalty to the loss (if training). 
""" - # batch = nn_util.move_to_device(batch, self.cuda_device) batch = nn_util.move_to_device(batch, self.model_engine.device) - # with profiler.profile(use_cuda=True, profile_memory=True) as prof: - #with profiler.record_function("forward"): - # with Profiler() as profiler: output_dict = self.model_engine(**batch) - # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)) - # print(profiler.output_text(unicode=True, color=True, show_all=True, timeline=True)) + + # for worker in range(2): + # logger.info(torch.cuda.memory_summary(worker)) + # logger.info(torch.cuda.memory_summary(self.model_engine.device)) if for_training: try: @@ -273,7 +223,10 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: self._batch_num_total += 1 batch_num_total = self._batch_num_total + # with LineProfiler(self.batch_outputs, target_gpu=self.model_engine.device) as prof: batch_outputs = self.batch_outputs(batch, for_training=True) + # prof.print_stats() # display() + loss = batch_outputs.get("loss") reg_loss = batch_outputs.get("reg_loss") if torch.isnan(loss): @@ -332,7 +285,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: param_updates, ) - self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) + # self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) for callback in self._batch_callbacks: callback( @@ -581,7 +534,7 @@ def train(self) -> Dict[str, Any]: ) - if self._master: + if False and self._master: self._checkpointer.save_checkpoint( epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() ) @@ -610,9 +563,9 @@ def train(self) -> Dict[str, Any]: self._tensorboard.close() # Load the best model state before returning - best_model_state = self._checkpointer.best_model_state() - if best_model_state: - self.model.load_state_dict(best_model_state) + # best_model_state = self._checkpointer.best_model_state() + # if best_model_state: + # self.model.load_state_dict(best_model_state) return metrics @@ -694,7 +647,11 @@ def from_partial_objects( model: Model, serialization_dir: str, data_loader: DataLoader, - deepspeed_config: DeepspeedConfig, + fp16: DeepspeedFP16Config = DeepspeedFP16Config(), + amp: DeepspeedAMPConfig = DeepspeedAMPConfig(), + zero_allow_untested_optimizer: bool = True, + wall_clock_breakdown: bool = False, + validation_data_loader: DataLoader = None, local_rank: int = 0, patience: int = None, @@ -706,38 +663,13 @@ def from_partial_objects( num_gradient_accumulation_steps: int = 1, no_grad: List[str] = None, optimizer: Lazy[Optimizer] = None, + zero_optimizer: Lazy[ZeroOptimizer] = None, tensorboard_writer: Lazy[TensorboardWriter] = None, moving_average: Lazy[MovingAverage] = None, checkpointer: Lazy[Checkpointer] = None, batch_callbacks: List[BatchCallback] = None, epoch_callbacks: List[EpochCallback] = None, ) -> "Trainer": - """ - This method exists so that we can have a documented method to construct this class using - `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this - method. - The reason we can't just use `__init__` with `FromParams` here is because there are - sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type - annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to - have the parameters from the `Model` before it's constructed, and the `Schedulers` need to - have the `Optimizer`. 
Because of this, the typical way we construct things `FromParams` - doesn't work, so we use `Lazy` to allow for constructing the objects sequentially. - If you're not using `FromParams`, you can just construct these arguments in the right order - yourself in your code and call the constructor directly. - """ - if cuda_device is None: - from torch import cuda - - if cuda.device_count() > 0: - cuda_device = 0 - else: - cuda_device = -1 - - check_for_gpu(cuda_device) - if cuda_device >= 0: - # Moving model to GPU here so that the optimizer state gets constructed on - # the right device. - model = model.cuda(cuda_device) if no_grad: for name, parameter in model.named_parameters(): @@ -752,10 +684,31 @@ def from_partial_objects( checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir) tensorboard_writer_ = tensorboard_writer.construct() or TensorboardWriter(serialization_dir) + optim_ = optimizer.construct(model_parameters=model.parameters()) # deepspeed only wants params, not names + model_engine = AllennlpDeepSpeedEngineAdapter( + args=Namespace(deepspeed_config=None, deepspeed=True, local_rank=local_rank), + model=model, + optimizer=optim_, + zero_optimizer=zero_optimizer, + model_parameters=model.parameters(), + dist_init_required=False, + config_params=dict( + fp=asdict(fp16), + amp=asdict(amp), + train_batch_size=data_loader.batch_size, + gradient_accumulation_steps=num_gradient_accumulation_steps, + zero_allow_untested_optimizer=zero_allow_untested_optimizer, + wall_clock_breakdown=wall_clock_breakdown, + tensorboard_enabled=False + ) + ) + ds_optimizer = model_engine.optimizer + return cls( model, data_loader, - deepspeed_config=deepspeed_config, + deepspeed_engine=model_engine, + deepspeed_optimizer=ds_optimizer, patience=patience, validation_metric=validation_metric, validation_data_loader=validation_data_loader, @@ -771,4 +724,19 @@ def from_partial_objects( local_rank=local_rank, world_size=world_size, num_gradient_accumulation_steps=num_gradient_accumulation_steps, - ) \ No newline at end of file + ) + + +def launch_ds( + model: torch.nn.Module, + optimizer, + zero_optim, + fp16: DeepspeedFP16Config, + amp: DeepspeedAMPConfig, + local_rank: int, + batch_size: int, + gradient_accumulation_steps: int, + zero_allow_untested_optimizer: bool, + wall_clock_breakdown: bool +): + return ds, ds.optimizer \ No newline at end of file diff --git a/allennlp/training/deepspeed/engine_adapter.py b/allennlp/training/deepspeed/engine_adapter.py index ec2b4ecf7e8..81fd818146e 100644 --- a/allennlp/training/deepspeed/engine_adapter.py +++ b/allennlp/training/deepspeed/engine_adapter.py @@ -2,6 +2,11 @@ Copyright 2019 The Microsoft DeepSpeed Team ''' +import logging +from deepspeed.utils import logger as ds_logger +ds_logger.setLevel(logging.WARNING) +ds_logger.propagate = False + import os import torch import warnings @@ -42,7 +47,6 @@ from allennlp.common import Lazy, FromParams from allennlp.training.deepspeed.optimizers.zero_optimization import ZeroOptimizer -# from allennlp.training.deepspeed.optimizers.fp16 import DeepspeedFP16Optimizer from allennlp.training.deepspeed.optimizers.basic import * @@ -139,6 +143,10 @@ def __init__(self, def dynamic_loss_scale(self): return self.loss_scale == 0 + @property + def postscale_gradients(self): + return not self._config.prescale_gradients + def _configure_lr_scheduler(self, client_lr_scheduler): # First check for scheduler in json configuration lr_scheduler = self._scheduler_from_config(self.optimizer) @@ -246,7 +254,7 @@ def 
_configure_fp16_optimizer(self, optimizer): verbose=False ) - if not self.dynamic_loss_scale(): + if not self.dynamic_loss_scale: return FP16_Optimizer(**defaults, static_loss_scale=self.loss_scale) defaults.update(dict( diff --git a/allennlp/training/deepspeed/optimizers/basic.py b/allennlp/training/deepspeed/optimizers/basic.py index d33ab90bad9..a9b94a27cfb 100644 --- a/allennlp/training/deepspeed/optimizers/basic.py +++ b/allennlp/training/deepspeed/optimizers/basic.py @@ -33,13 +33,6 @@ @Optimizer.register('fused_adam', constructor='construct') class DeepspeedFusedAdamOptimizer(Optimizer, FusedAdam): - # def __init__( - # self, - # model_parameters, - # **kwargs - # ): - # super().__init__(model_parameters, **kwargs) - @staticmethod def construct(model_parameters, **kwargs): return FusedAdam(model_parameters, **kwargs) From 0a74573f443a278223db7470764deff7df306d3c Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Fri, 30 Oct 2020 23:05:24 -0400 Subject: [PATCH 07/20] functioning / cleaner prototype --- allennlp/training/deepspeed/__init__.py | 12 +- allennlp/training/deepspeed/config.py | 62 ++ allennlp/training/deepspeed/engine_adapter.py | 570 ------------------ allennlp/training/deepspeed/optimizers.py | 87 +++ .../training/deepspeed/optimizers/__init__.py | 0 .../training/deepspeed/optimizers/basic.py | 44 -- .../training/deepspeed/optimizers/fp16.py | 53 -- .../deepspeed/optimizers/zero_optimization.py | 127 ---- .../deepspeed/sparse_transformer_embedder.py | 10 + .../{deepspeed_trainer.py => trainer.py} | 117 ++-- allennlp/training/deepspeed/utils.py | 39 ++ 11 files changed, 243 insertions(+), 878 deletions(-) create mode 100644 allennlp/training/deepspeed/config.py delete mode 100644 allennlp/training/deepspeed/engine_adapter.py create mode 100644 allennlp/training/deepspeed/optimizers.py delete mode 100644 allennlp/training/deepspeed/optimizers/__init__.py delete mode 100644 allennlp/training/deepspeed/optimizers/basic.py delete mode 100644 allennlp/training/deepspeed/optimizers/fp16.py delete mode 100644 allennlp/training/deepspeed/optimizers/zero_optimization.py create mode 100644 allennlp/training/deepspeed/sparse_transformer_embedder.py rename allennlp/training/deepspeed/{deepspeed_trainer.py => trainer.py} (89%) create mode 100644 allennlp/training/deepspeed/utils.py diff --git a/allennlp/training/deepspeed/__init__.py b/allennlp/training/deepspeed/__init__.py index 682238521aa..3a3637f660d 100644 --- a/allennlp/training/deepspeed/__init__.py +++ b/allennlp/training/deepspeed/__init__.py @@ -1 +1,11 @@ -from allennlp.training.deepspeed.deepspeed_trainer import DeepspeedTrainer \ No newline at end of file +from allennlp.training.deepspeed.trainer import DeepspeedTrainer +from allennlp.training.deepspeed.optimizers import ( + FusedAdamOptimizer, + DeepspeedCPUAdamOptimizer, + FusedLambOptimizer +) + +try: + from allennlp.training.deepspeed.sparse_transformer_embedder import SparseTransformerEmbedder +except ImportError: + pass \ No newline at end of file diff --git a/allennlp/training/deepspeed/config.py b/allennlp/training/deepspeed/config.py new file mode 100644 index 00000000000..35a08bb55e1 --- /dev/null +++ b/allennlp/training/deepspeed/config.py @@ -0,0 +1,62 @@ +from typing import Dict, Any +from enum import IntEnum +from allennlp.common import FromParams +from dataclasses import dataclass, asdict + + +@dataclass +class DeepspeedFP16Config(FromParams): + enabled: bool = True + loss_scale: float = 0. 
+ initial_scale_power: int = 32 + loss_scale_window: int = 1000 + hysteresis: int = 2 + min_loss_scale: float = 1. + +@dataclass +class DeepspeedAMPConfig(FromParams): + enabled: bool = False + opt_level: str = "O1" + +@dataclass +class DeepspeedOptimizerConfig(FromParams): + type: str + params: Dict[str, Any] + +class DeepspeedZeROStage(IntEnum): + DISABLED = 0 + OPTIMIZER = 1 + GRADIENT = 2 + +@dataclass +class DeepspeedZeROConfig(FromParams): + stage: DeepspeedZeROStage = DeepspeedZeROStage.GRADIENT + allgather_partitions: bool = True + allgather_bucket_size: int = 500000000 + overlap_comm: bool = False + reduce_scatter: bool = True + reduce_bucket_size: int = 500000000 + contiguous_gradients: bool = False + cpu_offload: bool = False + + +@dataclass +class DeepspeedConfig(FromParams): + zero_optimization: DeepspeedZeROConfig + fp16: DeepspeedFP16Config + amp: DeepspeedAMPConfig = DeepspeedAMPConfig() + optimizer: DeepspeedOptimizerConfig = None + + zero_allow_untested_optimizer: bool = True + wall_clock_breakdown: bool = False + + def to_dict(self): + return asdict(self) + + +@dataclass +class DeepspeedArgs(FromParams): + local_rank: int + deepspeed: bool = True + deepspeed_mpi: bool = False + deepspeed_config: str = None \ No newline at end of file diff --git a/allennlp/training/deepspeed/engine_adapter.py b/allennlp/training/deepspeed/engine_adapter.py deleted file mode 100644 index 81fd818146e..00000000000 --- a/allennlp/training/deepspeed/engine_adapter.py +++ /dev/null @@ -1,570 +0,0 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team -''' - -import logging -from deepspeed.utils import logger as ds_logger -ds_logger.setLevel(logging.WARNING) -ds_logger.propagate = False - -import os -import torch -import warnings -import torch.distributed as dist - -import apex -from apex.optimizers import ( - FusedAdam, - FusedLAMB -) -from torch import nn -from torch.distributed.distributed_c10d import _get_global_rank - -from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer -from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 -from deepspeed.runtime.zero.utils import is_zero_supported_optimizer -from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer -from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam -from deepspeed.runtime.config import DeepSpeedConfig, \ - ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, DEEPSPEED_ADAM, DEEPSPEED_OPTIMIZERS -from deepspeed.runtime.dataloader import DeepSpeedDataLoader -from deepspeed.runtime.constants import \ - ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ - TORCH_DISTRIBUTED_DEFAULT_PORT -from deepspeed.runtime.zero.constants import \ - ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS -from deepspeed.runtime.csr_tensor import CSRTensor -import deepspeed.runtime.lr_schedules as lr_schedules -from deepspeed.utils import logger, log_dist -from deepspeed.runtime.engine import ( - _initialize_parameter_parallel_groups, - split_half_float_double_csr, - flatten, - unflatten, - MEMORY_OPT_ALLREDUCE_SIZE -) - -from allennlp.common import Lazy, FromParams -from allennlp.training.deepspeed.optimizers.zero_optimization import ZeroOptimizer -from allennlp.training.deepspeed.optimizers.basic import * - - -class DummyTimer: - class Timer: - def __init__(self, name): - pass - - def start(self): - pass - - def stop(self): - pass - - def reset(self): - pass - - def __init__(self): - self.timers = {} - - def __call__(self, 
name): - if name not in self.timers: - self.timers[name] = self.Timer(name) - return self.timers[name] - - def log(self, *args, **kwargs): - pass - - - - -class AllennlpDeepSpeedEngineAdapter(FromParams, nn.Module): - r"""DeepSpeed engine for training. - """ - def __init__(self, - args, - model, - optimizer=None, - zero_optimizer: Lazy[ZeroOptimizer] = None, - model_parameters=None, - lr_scheduler=None, - mpu=None, - dist_init_required=None, - config_params=None - ): - super().__init__() - self.zero_optimizer = zero_optimizer - - self.client_optimizer = optimizer - self.client_model_parameters = model_parameters - self.client_lr_scheduler = lr_scheduler - self.mpu = mpu - self.data_parallel_group = None - self.micro_steps = 0 - self.skipped_steps = 0 - self.gradient_average = True - self.warn_unscaled_loss = True - self.config_params = config_params - self.enable_backward_allreduce = True - - if dist_init_required is None: - dist_init_required = not dist.is_initialized() - - self.dist_backend = "nccl" - if dist_init_required: - if not dist.is_initialized(): - logger.info("Initializing torch distributed with backend: {}".format( - self.dist_backend)) - dist.init_process_group(backend=self.dist_backend) - else: - logger.warning( - "Was given dist_init_required=True but detected that torch" - "distributed was already initialized, cannot initialize twice.") - - self._configure_with_arguments(args, mpu) - - self._init_distributed(dist_init_required) - - # Configure distributed model - self._configure_distributed_model(model) - - # Configure optimizer and scheduler - self.optimizer = self._configure_optimizer(optimizer, model_parameters) - self._configure_lr_scheduler(lr_scheduler) - - # Bookkeeping for csr support - self.csr_tensor_module_names = set() - if self.sparse_gradients_enabled: - for name, module in self.module.named_modules(): - if isinstance(module, torch.nn.Embedding): - self.csr_tensor_module_names.add(name + ".weight") - - @property - def dynamic_loss_scale(self): - return self.loss_scale == 0 - - @property - def postscale_gradients(self): - return not self._config.prescale_gradients - - def _configure_lr_scheduler(self, client_lr_scheduler): - # First check for scheduler in json configuration - lr_scheduler = self._scheduler_from_config(self.optimizer) - if lr_scheduler: - self.lr_scheduler = lr_scheduler - else: - self.lr_scheduler = client_lr_scheduler - - def _scheduler_from_config(self, optimizer): - scheduler_name = self.scheduler_name - if scheduler_name is not None: - if hasattr(lr_schedules, scheduler_name): - scheduler = getattr(lr_schedules, scheduler_name) - else: - assert hasattr(torch.optim.lr_scheduler, scheduler_name), \ - f"DeepSpeed does not recognize LR scheduler {scheduler_name}" - - scheduler = getattr(torch.optim.lr_scheduler, scheduler_name) - - instantiated_scheduler = scheduler(optimizer, **self.scheduler_params) - return instantiated_scheduler - else: - return None - - def _init_distributed(self, dist_init_required): - if self.local_rank >= 0: - torch.cuda.set_device(self.local_rank) - self.device = torch.device("cuda", self.local_rank) - self.world_size = dist.get_world_size() - self.global_rank = dist.get_rank() - else: - self.world_size = 1 - self.global_rank = 0 - self.device = torch.device("cuda") - - # Configure based on command line arguments - def _configure_with_arguments(self, args, mpu): - self.local_rank = args.local_rank if hasattr(args, 'local_rank') else 0 - self._config = DeepSpeedConfig(args.deepspeed_config, - mpu, - 
param_dict=self.config_params) - for k, v in vars(self._config).items(): - setattr(self, k, v) - - - def _is_supported_optimizer(self, optimizer_name): - return optimizer_name in DEEPSPEED_OPTIMIZERS or \ - getattr(torch.optim, optimizer_name, None) is not None - - def _broadcast_model(self): - for p in self.module.parameters(): - if torch.is_tensor(p): - dist.broadcast(p, - self.broadcast_src_rank, - group=self.data_parallel_group) - - def _configure_distributed_model(self, model): - self.module = model - if self.fp16_enabled: - self.module.half() - self.module.to(self.device) - - if self.mpu is None: - self.data_parallel_group = _initialize_parameter_parallel_groups() - self.dp_world_size = dist.get_world_size() - self.mp_world_size = 1 - self.broadcast_src_rank = 0 - else: - self.data_parallel_group = self.mpu.get_data_parallel_group() - self.dp_world_size = self.mpu.get_data_parallel_world_size() - self.mp_world_size = self.mpu.get_model_parallel_world_size() - self.broadcast_src_rank = _get_global_rank( - self.mpu.get_data_parallel_group(), - 0 - ) - - self._broadcast_model() - - def _configure_optimizer(self, client_optimizer, model_parameters): - basic_optimizer = client_optimizer - - if self.zero_enabled: #zero_optimization: # self.zero_optimizer or - if not is_zero_supported_optimizer(basic_optimizer): - assert self.zero_allow_untested_optimizer, \ - 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' - - if self.global_rank == 0: - logger.warning("**** You are using ZeRO with an untested optimizer, proceed with caution *****") - - return self._configure_zero_optimizer(basic_optimizer) - - if self.fp16_enabled: - return self._configure_fp16_optimizer(basic_optimizer) - - return basic_optimizer - - - def _configure_fp16_optimizer(self, optimizer): - defaults = dict( - init_optimizer=optimizer, - mpu=self.mpu, - clip_grad=self.gradient_clipping, - fused_adam_legacy=self.optimizer_legacy_fusion, - timers=None, - verbose=False - ) - - if not self.dynamic_loss_scale: - return FP16_Optimizer(**defaults, static_loss_scale=self.loss_scale) - - defaults.update(dict( - dynamic_loss_scale=True, - dynamic_loss_args=self.dynamic_loss_scale_args, - )) - - if isinstance(optimizer, (FusedAdam, OnebitAdam)): - extras = dict(initial_dynamic_scale=self.initial_dynamic_scale) - else: - extras = dict(fused_lamb_legacy=isinstance(optimizer, FusedLAMB)) - optimizer = FP16_Optimizer(**defaults, **extras) - return optimizer - - def _configure_zero_optimizer(self, optimizer): - optimizer = self.zero_optimizer.construct( - init_optimizer=optimizer, - dp_process_group=self.data_parallel_group, - mpu=self.mpu - ) - assert not (isinstance(optimizer, FP16_DeepSpeedZeroOptimizer_Stage1) and not self.zero_reduce_scatter), 'Stage 1 only supports reduce scatter mode' - return optimizer - - def train(self): - r""" - """ - - self.warn_unscaled_loss = True - self.module.train() - - def eval(self): - r""" - """ - - self.warn_unscaled_loss = True - self.module.train(False) - - def _scale_loss(self, prescaled_loss): - if isinstance(prescaled_loss, torch.Tensor): - scaled_loss = prescaled_loss / self.gradient_accumulation_steps - elif isinstance(prescaled_loss, tuple) or isinstance(prescaled_loss, list): - scaled_loss = [] - for l in prescaled_loss: - if isinstance(l, torch.Tensor): - scaled_loss.append(l / self.gradient_accumulation_steps) - else: - scaled_loss.append(l) - else: - scaled_loss = prescaled_loss - if self.warn_unscaled_loss: - 
logger.warning( - f'DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}' - ) - self.warn_unscaled_loss = False - - return scaled_loss - - def forward(self, *inputs, **kwargs): - r"""Execute forward propagation - - Arguments: - *inputs: Variable length input list - **kwargs: variable length keyword arguments - """ - loss = self.module(*inputs, **kwargs) - return loss - - def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): - #Zero stage 2 communicates during non gradient accumulation boundaries as well - if self.zero_optimization_stage >= ZERO_OPTIMIZATION_GRADIENTS: - self.optimizer.overlapping_partition_gradients_reduce_epilogue() - - #Communicate only at gradient accumulation boundaries - elif self.is_gradient_accumulation_boundary: - if self.zero_optimization_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES: - assert self.zero_reduce_scatter - self.optimizer.reduce_scatter_gradients( - postscale_gradients=self.postscale_gradients, - gradient_predivide_factor=self.gradient_predivide_factor, - gradient_average=self.gradient_average) - else: - self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) - - def backward(self, loss, allreduce_gradients=True, release_loss=False): - r"""Execute backward pass on the loss - - Arguments: - loss: Torch tensor on which to execute backward propagation - allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True. - """ - - # scale loss w.r.t. gradient accumulation if needed - if self.gradient_accumulation_steps > 1: - loss = self._scale_loss(loss.float()) - - assert self.optimizer is not None, "must provide optimizer during " \ - "init in order to use backward" - - if self.zero_enabled: #zero_optimization: - self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary - self.optimizer.backward(loss) - elif self.fp16_enabled: - self.optimizer.backward(loss) - else: - loss.backward() - - if allreduce_gradients and self.enable_backward_allreduce: - self.allreduce_gradients() - - return loss - - @property - def is_gradient_accumulation_boundary(self): - """Query whether the current micro-batch is at the boundary of - gradient accumulation, and thus will trigger gradient reductions and - an optimizer step. - - Returns: - bool: if the current step is a gradient accumulation boundary. - """ - return (self.micro_steps + 1) % \ - self.gradient_accumulation_steps == 0 - - def zero_grad(self): - """ - Zero parameter grads. - """ - for param_name, param in self.module.named_parameters(): - param.grad = None - - def clip_fp32_gradients(self): - torch.nn.utils.clip_grad_norm_(parameters=self.module.parameters(), - max_norm=self.gradient_clipping) - - def _take_model_step(self): - if self.gradient_clipping > 0.0 and not self.fp16_enabled: - self.clip_fp32_gradients() - self.optimizer.step() - - #zero grad in basic optimizer could be unreliable and may not exhibit - #the behaviour that we want - if not self.zero_enabled and not self.fp16_enabled: - self.zero_grad() - else: - self.optimizer.zero_grad() - - # Check overlow here since in DS fp16 optimizer, the overflow is updated in above step() function. - overflow = False - if hasattr(self.optimizer, 'overflow'): - overflow = self.optimizer.overflow - - if overflow: - self.skipped_steps += 1 - else: - if self.lr_scheduler is not None: - self.lr_scheduler.step() - - def step(self): - r"""Execute the weight update step after forward and backward propagation - on effective_train_batch. 
- """ - - assert self.optimizer is not None, "must provide optimizer during " \ - "init in order to use step" - - # Update the model when we reach gradient accumulation boundaries - if self.is_gradient_accumulation_boundary: - self._take_model_step() - - self.micro_steps += 1 - - def _get_optimizer_param(self, param_name): - result = [] - if not self.optimizer: - return result - for group in self.optimizer.param_groups: - if param_name in group: - result.append(group[param_name]) - else: - result.append(0.0) - return result - - - def allreduce_bucket(self, bucket): - tensor = flatten(bucket) - - tensor_to_allreduce = tensor - - if self.allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - if self.postscale_gradients: - if self.gradient_predivide_factor != 1.0: - tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor) - - dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group) - - if self.gradient_average: - if self.gradient_predivide_factor != self.dp_world_size: - tensor_to_allreduce.mul_(self.gradient_predivide_factor / self.dp_world_size) - else: - tensor_to_allreduce.div_(self.dp_world_size) - dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group) - - if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: - tensor.copy_(tensor_to_allreduce) - - return tensor - - def allreduce_and_copy(self, small_bucket): - allreduced = self.allreduce_bucket(small_bucket) - for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): - buf.copy_(synced) - - def allreduce_no_retain(self, bucket, numel_per_bucket=500000000): - small_bucket = [] - numel = 0 - for tensor in bucket: - small_bucket.append(tensor) - numel = numel + tensor.numel() - if numel > numel_per_bucket: - self.allreduce_and_copy(small_bucket) - small_bucket = [] - numel = 0 - if len(small_bucket) > 0: - self.allreduce_and_copy(small_bucket) - - def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000): - grads = [] - for param_name, param in self.module.named_parameters(): - if param.grad is None: - # In cases where there is an imbalance of empty grads across - # ranks we must create empty grads, this will ensure that every - # rank is reducing the same size. In some cases it may make - # sense in the future to support the ability to average not - # w.r.t. world size but with a different value. 
- param.grad = torch.zeros(param.size(), - dtype=param.dtype, - device=param.device) - grads.append(param.grad.data) - else: - grad_data = param.grad.data - if self.sparse_gradients_enabled and param_name in self.csr_tensor_module_names: - grads.append(CSRTensor(grad_data)) - else: - grads.append(grad_data) - - split_buckets = split_half_float_double_csr(grads) - - for i, bucket_tuple in enumerate(split_buckets): - bucket_type, bucket = bucket_tuple - if bucket_type == CSRTensor.type(): - self.csr_allreduce_no_retain(bucket) - else: - self.allreduce_no_retain(bucket, numel_per_bucket=elements_per_buffer) - - def csr_allreduce_no_retain(self, bucket): - allreduced_csrs = self.csr_allreduce_bucket(bucket) - # Densify csr tensor and copy back to original location - for csr in allreduced_csrs: - dense_tensor = csr.to_dense() - csr.orig_dense_tensor.copy_(dense_tensor) - - def csr_allreduce_bucket(self, bucket): - csr_list = [] - for csr in bucket: - csr_list.append(self.csr_allreduce(csr)) - return csr_list - - def csr_allreduce(self, csr): - # Pre-divide for fp16 stability - csr.values.div_(self.dp_world_size) - - indices_device_list = self.csr_all_gather(csr.indices) - values_device_list = self.csr_all_gather(csr.values) - - csr.indices = torch.cat(indices_device_list) - csr.values = torch.cat(values_device_list) - return csr - - def csr_all_gather(self, value): - my_size = torch.LongTensor([value.size()[0]]).to(self.device) - all_sizes = self.all_gather_scalar(my_size) - max_size = torch.cat(all_sizes).max() - fill_size = (max_size - my_size) - - assert value.dim() in [1, 2] - if value.dim() == 1: - if fill_size > 0: - value = torch.cat([value, value.new_zeros(fill_size)]) - tensor_list = [value.new_zeros(max_size) for _ in range(self.dp_world_size)] - else: - if fill_size > 0: - value = torch.cat([value, value.new_zeros(fill_size, value.size()[1])]) - tensor_list = [ - value.new_zeros(max_size, - value.size()[1]) for _ in range(self.dp_world_size) - ] - - dist.all_gather(tensor_list, value, group=self.data_parallel_group) - tensors = [] - for dev_idx, t in enumerate(tensor_list): - size = all_sizes[dev_idx][0] - tensors.append( - t.index_select(0, - torch.LongTensor(range(size)).to(self.device))) - - return tensors - - def all_gather_scalar(self, value): - tensor_list = [value.new_zeros(value.size()) for _ in range(self.dp_world_size)] - dist.all_gather(tensor_list, value, group=self.data_parallel_group) - return tensor_list \ No newline at end of file diff --git a/allennlp/training/deepspeed/optimizers.py b/allennlp/training/deepspeed/optimizers.py new file mode 100644 index 00000000000..7eaa229a480 --- /dev/null +++ b/allennlp/training/deepspeed/optimizers.py @@ -0,0 +1,87 @@ +from typing import List, Tuple, Dict, Any + +import torch + +from apex.optimizers.fused_adam import FusedAdam +from deepspeed.ops.adam import DeepSpeedCPUAdam +from deepspeed.ops.lamb import FusedLamb +from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + +from allennlp.training.optimizers import Optimizer, make_parameter_groups + +@Optimizer.register("fused_adam") +class FusedAdamOptimizer(Optimizer, FusedAdam): + def __init__( + self, + model_parameters: List[Tuple[str, torch.nn.Parameter]], + parameter_groups: List[Tuple[List[str], Dict[str, Any]]] = None, + lr: float = 0.001, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-08, + weight_decay: float = 0.0, + amsgrad: bool = False, + bias_correction: bool =True, + adam_w_mode: bool = True, + set_grad_none: bool = True, + ): + 
super().__init__( + params=make_parameter_groups(model_parameters, parameter_groups), + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad, + bias_correction=bias_correction, + adam_w_mode=adam_w_mode, + set_grad_none=set_grad_none, + ) + +# This does not currently work +@Optimizer.register("cpu_adam") +class DeepspeedCPUAdamOptimizer(Optimizer, DeepSpeedCPUAdam): + def __init__( + self, + model_parameters: List[Tuple[str, torch.nn.Parameter]], + parameter_groups: List[Tuple[List[str], Dict[str, Any]]] = None, + lr: float = 0.001, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-08, + weight_decay: float = 0.0, + amsgrad: bool = False, + ): + super().__init__( + model_params=make_parameter_groups(model_parameters, parameter_groups), + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad + ) + +@Optimizer.register("fused_lamb") +class FusedLambOptimizer(Optimizer, FusedLamb): + def __init__( + self, + model_parameters: List[Tuple[str, torch.nn.Parameter]], + parameter_groups: List[Tuple[List[str], Dict[str, Any]]] = None, + lr: float = 0.001, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-08, + eps_inside_sqrt: bool = False, + weight_decay: float = 0.0, + amsgrad: bool = False, + max_grad_norm: float = 0., + max_coeff: float = 10.0, + min_coeff: float = 0.01 + ): + super().__init__( + params=make_parameter_groups(model_parameters, parameter_groups), + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad, + max_grad_norm=max_grad_norm, + max_coeff=max_coeff, + min_coeff=min_coeff, + ) \ No newline at end of file diff --git a/allennlp/training/deepspeed/optimizers/__init__.py b/allennlp/training/deepspeed/optimizers/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/allennlp/training/deepspeed/optimizers/basic.py b/allennlp/training/deepspeed/optimizers/basic.py deleted file mode 100644 index a9b94a27cfb..00000000000 --- a/allennlp/training/deepspeed/optimizers/basic.py +++ /dev/null @@ -1,44 +0,0 @@ -from deepspeed.runtime.zero.utils import is_zero_supported_optimizer -from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer -from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer -from deepspeed.runtime.config import ( - DeepSpeedConfig, - ADAM_OPTIMIZER, - LAMB_OPTIMIZER, - ONEBIT_ADAM_OPTIMIZER, - DEEPSPEED_ADAM, - DEEPSPEED_OPTIMIZERS -) - -from apex.optimizers.fused_adam import FusedAdam -from deepspeed.ops.adam import DeepSpeedCPUAdam -from deepspeed.ops.lamb import FusedLamb - -# from allennlp.common import Registrable -from allennlp.training.optimizers import Optimizer - -# class DeepspeedOptimizer(Registrable): -# default_implementation = "fused_adam" - -# DeepspeedOptimizer.register('adam_cpu')(FP16_DeepSpeedZeroOptimizer_Stage1) -# DeepspeedOptimizer.register('fused_adam')(FusedAdam) -# DeepspeedOptimizer.register('deepspeed_adam')(DeepSpeedCPUAdam) -# DeepspeedOptimizer.register('one_bit_adam') -# DeepspeedOptimizer.register('lamb') - -# Optimizer.register('adam_cpu')(FP16_DeepSpeedZeroOptimizer_Stage1) -# Optimizer.register('fused_adam')(FusedAdam) -# Optimizer.register('deepspeed_cpu_adam')(DeepSpeedCPUAdam) -# Optimizer.register('lamb')(FusedLamb) - -@Optimizer.register('fused_adam', constructor='construct') -class DeepspeedFusedAdamOptimizer(Optimizer, FusedAdam): - @staticmethod - def construct(model_parameters, **kwargs): - return FusedAdam(model_parameters, **kwargs) - -try: - from 
deepspeed.runtime.fp16.onebit_adam import OnebitAdam - Optimizer.register('one_bit_adam')(OnebitAdam) -except ImportError: - pass \ No newline at end of file diff --git a/allennlp/training/deepspeed/optimizers/fp16.py b/allennlp/training/deepspeed/optimizers/fp16.py deleted file mode 100644 index 538ff2218d4..00000000000 --- a/allennlp/training/deepspeed/optimizers/fp16.py +++ /dev/null @@ -1,53 +0,0 @@ -from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer -from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam - -from allennlp.common import Registrable, Lazy -from allennlp.training.optimizers import Optimizer - - -class DeepspeedFP16Optimizer(Registrable): - default_implementation = 'fused' - -@DeepspeedFP16Optimizer.register('fused', constructor='construct') -class DeepspeedFusedFP16Optimizer(DeepspeedFP16Optimizer): - @staticmethod - def construct( - init_optimizer: Optimizer, - mpu=None, - clip_grad=0.0, - static_loss_scale=1.0, - dynamic_loss_scale=False, - initial_dynamic_scale=2**32, - dynamic_loss_args=None, - fused_adam_legacy=False, - timers=None, - verbose=False - ): - if isinstance(optimizer, (apex.optimizers.FusedAdam, OnebitAdam)): - pass - -def _configure_fp16_optimizer(self, optimizer): - initial_dynamic_scale = self.initial_dynamic_scale() - dynamic_loss_args = self.dynamic_loss_scale_args() - - if isinstance(optimizer, apex.optimizers.FusedAdam) or self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: - defaults['fused_adam_legacy'] = self.optimizer_legacy_fusion() - if self.dynamic_loss_scale(): - defaults.update(dict( - dynamic_loss_scale=True, - initial_dynamic_scale=initial_dynamic_scale, - dynamic_loss_args=dynamic_loss_args, - )) - else: - defaults.update(dict(static_loss_scale=self.loss_scale())) - optimizer = FP16_Optimizer(**defaults) - else: - optimizer = FP16_UnfusedOptimizer( - **defaults, - dynamic_loss_scale=self.dynamic_loss_scale(), - dynamic_loss_args=dynamic_loss_args, - fused_lamb_legacy=isinstance(optimizer, apex.optimizers.FusedLamb) - ) - # raise ValueError(optimizer) - return optimizer \ No newline at end of file diff --git a/allennlp/training/deepspeed/optimizers/zero_optimization.py b/allennlp/training/deepspeed/optimizers/zero_optimization.py deleted file mode 100644 index 8d7dbf8af8e..00000000000 --- a/allennlp/training/deepspeed/optimizers/zero_optimization.py +++ /dev/null @@ -1,127 +0,0 @@ -from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer -from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 -from deepspeed.runtime.zero.utils import is_zero_supported_optimizer -# from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer -# from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer -# from deepspeed.runtime.config import ( -# DeepSpeedConfig, -# ADAM_OPTIMIZER, -# LAMB_OPTIMIZER, -# ONEBIT_ADAM_OPTIMIZER, -# DEEPSPEED_ADAM, -# DEEPSPEED_OPTIMIZERS -# ) - -from allennlp.common import Registrable, Lazy -from allennlp.training.optimizers import Optimizer - - - -class DummyTimer: - class Timer: - def __init__(self, name): - pass - - def start(self): - pass - - def stop(self): - pass - - def reset(self): - pass - - def __init__(self): - self.timers = {} - - def __call__(self, name): - if name not in self.timers: - self.timers[name] = self.Timer(name) - return self.timers[name] - - def log(self, *args, **kwargs): - pass - - - -class ZeroOptimizer(Registrable): - default_implementation = "stage_2" 
# "disabled" - -@ZeroOptimizer.register('stage_1', constructor='construct') -class ZeroStage1Optimizer(ZeroOptimizer, FP16_DeepSpeedZeroOptimizer_Stage1): - stage = 1 - - @staticmethod - def construct( - init_optimizer: Optimizer, - dp_process_group=None, - mpu=None, - **kwargs, - ): - return FP16_DeepSpeedZeroOptimizer_Stage1( - init_optimizer, - timers=timers, - dp_process_group=dp_process_group, - mpu=mpu, - **kwargs - ) - - -@ZeroOptimizer.register('stage_2', constructor='construct') -class ZeroStage2Optimizer(ZeroOptimizer, FP16_DeepSpeedZeroOptimizer): - stage = 2 - - @staticmethod - def construct( - init_optimizer: Optimizer, - timers = DummyTimer(), - dp_process_group=None, - mpu=None, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=False, - contiguous_gradients=True, - reduce_bucket_size=500000000, - allgather_bucket_size=5000000000, - reduce_scatter=True, - overlap_comm=False, - cpu_offload=False, - clip_grad=0.0, - allreduce_always_fp32=False, - postscale_gradients=True, - gradient_predivide_factor=1.0, - gradient_accumulation_steps=1 - ): - return FP16_DeepSpeedZeroOptimizer( - init_optimizer, - timers=timers, - dp_process_group=dp_process_group, - mpu=mpu, - dynamic_loss_scale=dynamic_loss_scale, - dynamic_loss_args=dynamic_loss_args, - verbose=verbose, - contiguous_gradients=contiguous_gradients, - reduce_bucket_size=reduce_bucket_size, - allgather_bucket_size=allgather_bucket_size, - reduce_scatter=reduce_scatter, - overlap_comm=overlap_comm, - cpu_offload=cpu_offload, - clip_grad=clip_grad, - allreduce_always_fp32=allreduce_always_fp32, - postscale_gradients=postscale_gradients, - gradient_predivide_factor=gradient_predivide_factor, - gradient_accumulation_steps=gradient_accumulation_steps - ) - -# @ZeroOptimizer.register('stage_2') -# class ZeroStage2Optimizer(FP16_DeepSpeedZeroOptimizer): -# def __init__(self, init_optimizer=None, timers=DummyTimer(), **kwargs): -# print('!!!!!!!!!!!!!!!') -# print(kwargs) -# assert init_optimizer is not None, init_optimizer -# super().__init__(init_optimizer, timers=timers, **kwargs) - - -# ZeroOptimizer.register('stage_1')(FP16_DeepSpeedZeroOptimizer_Stage1) -# ZeroOptimizer.register('stage_2')(FP16_DeepSpeedZeroOptimizer) \ No newline at end of file diff --git a/allennlp/training/deepspeed/sparse_transformer_embedder.py b/allennlp/training/deepspeed/sparse_transformer_embedder.py new file mode 100644 index 00000000000..2207bdfa794 --- /dev/null +++ b/allennlp/training/deepspeed/sparse_transformer_embedder.py @@ -0,0 +1,10 @@ +from allennlp.modules.token_embedders.token_embedder import TokenEmbedder +from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder + +from deepspeed.ops.sparse_attention.sparse_attention_utils import SparseAttentionUtils + +@TokenEmbedder.register('sparse_transformer') +class SparseTransformerEmbedder(PretrainedTransformerEmbedder): + class __init__(self, **kwargs): + super().__init__(**kwargs) + self.transformer_model = SparseAttentionUtils.replace_model_self_attention_with_sparse_self_attention(self.transformer_model) \ No newline at end of file diff --git a/allennlp/training/deepspeed/deepspeed_trainer.py b/allennlp/training/deepspeed/trainer.py similarity index 89% rename from allennlp/training/deepspeed/deepspeed_trainer.py rename to allennlp/training/deepspeed/trainer.py index a43f408980c..3c071b1a90e 100644 --- a/allennlp/training/deepspeed/deepspeed_trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -1,20 
+1,22 @@ import datetime import logging +import math import os import re -import math import time import traceback -from argparse import Namespace -from copy import deepcopy from contextlib import contextmanager from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from dataclasses import dataclass, asdict + +from allennlp.common.util import int_to_device import torch import torch.distributed as dist +from torch.cuda import amp +from torch.nn.parallel import DistributedDataParallel +from torch.nn.utils import clip_grad_norm_ -from allennlp.common import Lazy, Registrable, Tqdm, Params, FromParams +from allennlp.common import Lazy, Registrable, Tqdm from allennlp.common import util as common_util from allennlp.common.checks import ConfigurationError, check_for_gpu from allennlp.data import DataLoader @@ -27,39 +29,21 @@ from allennlp.training.moving_average import MovingAverage from allennlp.training.optimizers import Optimizer from allennlp.training.tensorboard_writer import TensorboardWriter -from allennlp.training.trainer import Trainer, BatchCallback, EpochCallback -from allennlp.training.deepspeed.engine_adapter import AllennlpDeepSpeedEngineAdapter -from allennlp.training.deepspeed.optimizers.zero_optimization import ZeroOptimizer +from allennlp.training.trainer import Trainer, BatchCallback, EpochCallback -from pytorch_memlab import LineProfiler +from allennlp.training.deepspeed.config import DeepspeedConfig, DeepspeedArgs +from allennlp.training.deepspeed.utils import launch_deepspeed logger = logging.getLogger(__name__) -@dataclass -class DeepspeedFP16Config(FromParams): - enabled: bool = False - loss_scale: float = 0. - initial_scale_power: int = 32 - loss_scale_window: int = 1000 - hysteresis: int = 2 - min_loss_scale: float = 1. - -@dataclass -class DeepspeedAMPConfig(FromParams): - enabled: bool = False - opt_level: str = "O1" - - @Trainer.register("deepspeed", constructor="from_partial_objects") class DeepspeedTrainer(Trainer): def __init__( self, model: Model, data_loader: DataLoader, - # deepspeed_config: DeepspeedConfig, - deepspeed_engine: AllennlpDeepSpeedEngineAdapter, - deepspeed_optimizer: ZeroOptimizer, + deepspeed_engine: 'DeepSpeedEngine', patience: Optional[int] = None, validation_metric: str = "-loss", validation_data_loader: DataLoader = None, @@ -77,18 +61,10 @@ def __init__( num_gradient_accumulation_steps: int = 1 ) -> None: super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size) - - # I am not calling move_to_gpu here, because if the model is - # not already on the GPU then the optimizer is going to be wrong. 
self.model = model self.model_engine = deepspeed_engine - self.optimizer = deepspeed_optimizer - - if hasattr(self.model_engine, 'timers'): - def mute_log(*args, **kwargs): - pass - self.model_engine.timers.log = mute_log + self.optimizer = deepspeed_engine.optimizer self.data_loader = data_loader self._validation_data_loader = validation_data_loader @@ -143,10 +119,6 @@ def batch_outputs(self, batch: TensorDict, for_training: bool) -> Dict[str, torc """ batch = nn_util.move_to_device(batch, self.model_engine.device) output_dict = self.model_engine(**batch) - - # for worker in range(2): - # logger.info(torch.cuda.memory_summary(worker)) - # logger.info(torch.cuda.memory_summary(self.model_engine.device)) if for_training: try: @@ -285,7 +257,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: param_updates, ) - # self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) + self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) for callback in self._batch_callbacks: callback( @@ -563,9 +535,9 @@ def train(self) -> Dict[str, Any]: self._tensorboard.close() # Load the best model state before returning - # best_model_state = self._checkpointer.best_model_state() - # if best_model_state: - # self.model.load_state_dict(best_model_state) + best_model_state = self._checkpointer.best_model_state() + if best_model_state: + self.model.load_state_dict(best_model_state) return metrics @@ -613,7 +585,7 @@ def _restore_checkpoint(self) -> int: return 0 self.model.load_state_dict(model_state) - # self.model_engine.load_checkpoint() + self.model_engine.load_checkpoint() self.optimizer.load_state_dict(training_state["optimizer"]) training_util.move_optimizer_to_cuda(self.optimizer) @@ -647,10 +619,7 @@ def from_partial_objects( model: Model, serialization_dir: str, data_loader: DataLoader, - fp16: DeepspeedFP16Config = DeepspeedFP16Config(), - amp: DeepspeedAMPConfig = DeepspeedAMPConfig(), - zero_allow_untested_optimizer: bool = True, - wall_clock_breakdown: bool = False, + deepspeed_config: DeepspeedConfig, validation_data_loader: DataLoader = None, local_rank: int = 0, @@ -663,14 +632,14 @@ def from_partial_objects( num_gradient_accumulation_steps: int = 1, no_grad: List[str] = None, optimizer: Lazy[Optimizer] = None, - zero_optimizer: Lazy[ZeroOptimizer] = None, + deepspeed_optimizer: Dict[str, Any] = None, + deepspeed_args: Lazy[DeepspeedArgs] = None, tensorboard_writer: Lazy[TensorboardWriter] = None, moving_average: Lazy[MovingAverage] = None, checkpointer: Lazy[Checkpointer] = None, batch_callbacks: List[BatchCallback] = None, epoch_callbacks: List[EpochCallback] = None, ) -> "Trainer": - if no_grad: for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad): @@ -684,31 +653,26 @@ def from_partial_objects( checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir) tensorboard_writer_ = tensorboard_writer.construct() or TensorboardWriter(serialization_dir) - optim_ = optimizer.construct(model_parameters=model.parameters()) # deepspeed only wants params, not names - model_engine = AllennlpDeepSpeedEngineAdapter( - args=Namespace(deepspeed_config=None, deepspeed=True, local_rank=local_rank), - model=model, - optimizer=optim_, - zero_optimizer=zero_optimizer, - model_parameters=model.parameters(), - dist_init_required=False, - config_params=dict( - fp=asdict(fp16), - amp=asdict(amp), - train_batch_size=data_loader.batch_size, - gradient_accumulation_steps=num_gradient_accumulation_steps, - 
zero_allow_untested_optimizer=zero_allow_untested_optimizer, - wall_clock_breakdown=wall_clock_breakdown, - tensorboard_enabled=False - ) + if deepspeed_config.optimizer: + optim_ = None + else: + optim_ = optimizer.construct(model_parameters=parameters) + + deepspeed_args = deepspeed_args.construct(local_rank=local_rank) or DeepspeedArgs(local_rank=local_rank) + model_engine, ds_optimizer = launch_deepspeed( + model, + optim_, + deepspeed_config, + deepspeed_args, + data_loader.batch_size, + num_gradient_accumulation_steps ) - ds_optimizer = model_engine.optimizer return cls( model, data_loader, deepspeed_engine=model_engine, - deepspeed_optimizer=ds_optimizer, + # optimizer=ds_optimizer, patience=patience, validation_metric=validation_metric, validation_data_loader=validation_data_loader, @@ -727,16 +691,3 @@ def from_partial_objects( ) -def launch_ds( - model: torch.nn.Module, - optimizer, - zero_optim, - fp16: DeepspeedFP16Config, - amp: DeepspeedAMPConfig, - local_rank: int, - batch_size: int, - gradient_accumulation_steps: int, - zero_allow_untested_optimizer: bool, - wall_clock_breakdown: bool -): - return ds, ds.optimizer \ No newline at end of file diff --git a/allennlp/training/deepspeed/utils.py b/allennlp/training/deepspeed/utils.py new file mode 100644 index 00000000000..4be22a58d8b --- /dev/null +++ b/allennlp/training/deepspeed/utils.py @@ -0,0 +1,39 @@ +import logging +from deepspeed.utils import logger as ds_logger +ds_logger.setLevel(logging.WARNING) +ds_logger.propagate = False + +import torch +from allennlp.models.model import Model +from allennlp.common import Lazy +from allennlp.common.checks import ConfigurationError +from allennlp.training.deepspeed.config import DeepspeedConfig, DeepspeedArgs + +import deepspeed +from deepspeed.runtime.engine import DeepSpeedEngine + +def launch_deepspeed( + model: Model, + optimizer: torch.optim.Optimizer, + config: DeepspeedConfig, + args: Lazy[DeepspeedArgs], + batch_size: int, + gradient_accumulation_steps: int, +): + if not(optimizer is None or config.optimizer is None): + raise ConfigurationError(f"Cannot provide both optimizer and deepspeed_optimizer. 
{optimizer, config.to_dict()}") + + config = dict(**config.to_dict(), train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) + ds = DeepSpeedEngine( + args=args, + model=model, + optimizer=optimizer, + model_parameters=model.parameters(), + dist_init_required=False, + config_params=config + ) + if hasattr(ds, 'timers'): + def mute_log(*args, **kwargs): + pass + ds.timers.log = mute_log + return ds, ds.optimizer \ No newline at end of file From 498d3a2732f1078ca719235f957dd7be9cd0736c Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Mon, 2 Nov 2020 15:50:49 -0500 Subject: [PATCH 08/20] checkpointing works e2e --- allennlp/training/deepspeed/checkpointer.py | 131 ++++++++++++++++++++ allennlp/training/deepspeed/trainer.py | 54 ++++---- 2 files changed, 162 insertions(+), 23 deletions(-) create mode 100644 allennlp/training/deepspeed/checkpointer.py diff --git a/allennlp/training/deepspeed/checkpointer.py b/allennlp/training/deepspeed/checkpointer.py new file mode 100644 index 00000000000..2742c56b7de --- /dev/null +++ b/allennlp/training/deepspeed/checkpointer.py @@ -0,0 +1,131 @@ +from typing import Union, Dict, Any, List, Tuple, Optional + +import logging +import os +import re +import shutil +import time + +from pathlib import Path + +import torch + +import allennlp +from allennlp.nn import util as nn_util +from allennlp.training import util as training_util, Checkpointer + +logger = logging.getLogger(__name__) +_DeepspeedTrainer = "allennlp.training.deepspeed.trainer.DeepspeedTrainer" + + +class DeepspeedCheckpointer(Checkpointer): + # def maybe_save_checkpoint( + # self, + # trainer: _DeepspeedTrainer, + # epoch: int, + # batches_this_epoch: int + # ) -> None: + # 0/0 + + def save_checkpoint( + self, + epoch: Union[int, str], + trainer: _DeepspeedTrainer, + is_best_so_far: bool = False, + save_model_only=False, + ) -> None: + if self._serialization_dir is None: + return + + with trainer.get_checkpoint_state() as state: + model_engine, model_state, training_states = state + + checkpoint_id = "deepspeed_epoch_{}".format(epoch) + model_path = os.path.join(self._serialization_dir, "model_state_epoch_{}".format(epoch)) + model_engine.save_checkpoint(self._serialization_dir, checkpoint_id) + + # TODO + # Model will need a weight file to load; + # not sure if ZeRO stage 2 will mess this up + if not os.path.isfile(model_path): + torch.save(model_state, model_path) + if save_model_only: + return + + training_path = os.path.join( + self._serialization_dir, "training_state_epoch_{}.th".format(epoch) + ) + if not os.path.isfile(training_path): + torch.save({**training_states, "epoch": epoch}, training_path) + + # The main checkpointing logic is now done, this is just shuffling files around, to keep + # track of best weights, and to remove old checkpoints, if desired. + if is_best_so_far: + logger.info( + "Best validation performance so far. 
Copying weights to '%s/best.th'.", + self._serialization_dir, + ) + shutil.copyfile(model_path, os.path.join(self._serialization_dir, "best.th")) + + engine_dir = os.path.join(self._serialization_dir, "best_deepspeed") + shutil.rmtree(engine_dir, ignore_errors=True) # in case no previous checkpoints + shutil.copytree(os.path.join(self._serialization_dir, checkpoint_id), engine_dir) + + if ( + self._num_serialized_models_to_keep is not None + and self._num_serialized_models_to_keep >= 0 + ): + self._serialized_paths.append((time.time(), model_path, training_path)) + if len(self._serialized_paths) > self._num_serialized_models_to_keep: + paths_to_remove = self._serialized_paths.pop(0) + # Check to see if we should keep this checkpoint, if it has been longer + # then self._keep_serialized_model_every_num_seconds since the last + # kept checkpoint. + remove_path = True + if self._keep_serialized_model_every_num_seconds is not None: + save_time = paths_to_remove[0] + time_since_checkpoint_kept = ( + save_time - self._last_permanent_saved_checkpoint_time + ) + if ( + time_since_checkpoint_kept + > self._keep_serialized_model_every_num_seconds + ): + # We want to keep this checkpoint. + remove_path = False + self._last_permanent_saved_checkpoint_time = save_time + if remove_path: + for fname in paths_to_remove[1:]: + if os.path.isfile(fname): + os.remove(fname) + + def find_latest_checkpoint(self) -> Optional[Tuple[str, str]]: + latest = super().find_latest_checkpoint() + if not latest: + return None + + model_path, training_state_path = latest + + checkpoints = (self._serialization_dir and Path(self._serialization_dir).glob('deepspeed_epoch_*')) or [] + checkpoints = sorted(c for c in checkpoints if c.is_dir()) + if not checkpoints: + return None + + engine_path = checkpoints[-1] + return engine_path, model_path, training_state_path + + def restore_checkpoint(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: + latest_checkpoint = self.find_latest_checkpoint() + + if latest_checkpoint is None: + # No checkpoint to restore, start at 0 + return {}, {}, {} + + checkpoint_id, model_path, training_state_path = latest_checkpoint + + model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1)) + training_state = torch.load(training_state_path, map_location=nn_util.device_mapping(-1)) + return checkpoint_id, model_state, training_state + + def best_model_state(self) -> Dict[str, Any]: + pass \ No newline at end of file diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index 3c071b1a90e..896e8ef87d4 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -30,9 +30,10 @@ from allennlp.training.optimizers import Optimizer from allennlp.training.tensorboard_writer import TensorboardWriter -from allennlp.training.trainer import Trainer, BatchCallback, EpochCallback +from allennlp.training.trainer import Trainer, BatchCallback, EpochCallback, TrainerCallback from allennlp.training.deepspeed.config import DeepspeedConfig, DeepspeedArgs +from allennlp.training.deepspeed.checkpointer import DeepspeedCheckpointer from allennlp.training.deepspeed.utils import launch_deepspeed logger = logging.getLogger(__name__) @@ -55,6 +56,8 @@ def __init__( moving_average: Optional[MovingAverage] = None, batch_callbacks: List[BatchCallback] = None, epoch_callbacks: List[EpochCallback] = None, + end_callbacks: List[EpochCallback] = None, + trainer_callbacks: List[TrainerCallback] = None, distributed: bool = False, local_rank: int 
= 0, world_size: int = 1, @@ -91,11 +94,17 @@ def __init__( if checkpointer is not None: self._checkpointer = checkpointer else: - self._checkpointer = Checkpointer(serialization_dir) + self._checkpointer = DeepspeedCheckpointer(serialization_dir) self._moving_average = moving_average self._batch_callbacks = batch_callbacks or [] self._epoch_callbacks = epoch_callbacks or [] + self._end_callbacks = end_callbacks or [] + + for callback in trainer_callbacks or []: + self._batch_callbacks.append(callback.batch()) + self._epoch_callbacks.append(callback.epoch()) + self._end_callbacks.append(callback.end()) # We keep the total batch number as an instance variable because it # is used inside a closure for the hook which logs activations in @@ -144,13 +153,13 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) cpu_memory_usage = [] - for worker, memory in common_util.peak_memory_mb().items(): + for worker, memory in common_util.peak_cpu_memory().items(): cpu_memory_usage.append((worker, memory)) - logger.info(f"Worker {worker} memory usage MB: {memory}") + logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}") gpu_memory_usage = [] - for gpu, memory in common_util.gpu_memory_mb().items(): + for gpu, memory in common_util.peak_gpu_memory().items(): gpu_memory_usage.append((gpu, memory)) - logger.info(f"GPU {gpu} memory usage MB: {memory}") + logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") regularization_penalty = self.model.get_regularization_penalty() @@ -283,9 +292,9 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: ) for (worker, memory) in cpu_memory_usage: - metrics["worker_" + str(worker) + "_memory_MB"] = memory + metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024) for (gpu_num, memory) in gpu_memory_usage: - metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory + metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024) return metrics def _validation_loss(self, epoch: int) -> Tuple[float, float, int]: @@ -506,14 +515,10 @@ def train(self) -> Dict[str, Any]: ) - if False and self._master: - self._checkpointer.save_checkpoint( - epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() - ) - - # Wait for the master to finish saving the checkpoint - if self._distributed: - dist.barrier() + # deepspeed checkpointing handles master / dist.barrier calls + self._checkpointer.save_checkpoint( + epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() + ) for callback in self._epoch_callbacks: callback(self, metrics=metrics, epoch=epoch, is_master=self._master) @@ -558,7 +563,7 @@ def get_checkpoint_state(self) -> Iterator[Tuple[Dict[str, Any], Dict[str, Any]] } try: - yield model_state, training_states + yield self.model_engine, model_state, training_states finally: if self._moving_average is not None: self._moving_average.restore() @@ -578,16 +583,16 @@ def _restore_checkpoint(self) -> int: The epoch at which to resume training, which should be one after the epoch in the saved training state. 
""" - model_state, training_state = self._checkpointer.restore_checkpoint() + checkpoint_id, model_state, training_state = self._checkpointer.restore_checkpoint() if not training_state: # No checkpoint to restore, start at 0 return 0 self.model.load_state_dict(model_state) - self.model_engine.load_checkpoint() - self.optimizer.load_state_dict(training_state["optimizer"]) - training_util.move_optimizer_to_cuda(self.optimizer) + self.model_engine.load_checkpoint(self._serialization_dir, checkpoint_id) + # self.optimizer.load_state_dict(training_state["optimizer"]) + # training_util.move_optimizer_to_cuda(self.optimizer) # Currently the `training_state` contains a serialized `MetricTracker`. if "metric_tracker" in training_state: @@ -639,6 +644,8 @@ def from_partial_objects( checkpointer: Lazy[Checkpointer] = None, batch_callbacks: List[BatchCallback] = None, epoch_callbacks: List[EpochCallback] = None, + end_callbacks: List[EpochCallback] = None, + trainer_callbacks: List[TrainerCallback] = None, ) -> "Trainer": if no_grad: for name, parameter in model.named_parameters(): @@ -650,7 +657,7 @@ def from_partial_objects( parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] moving_average_ = moving_average.construct(parameters=parameters) - checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir) + checkpointer_ = checkpointer.construct() or DeepspeedCheckpointer(serialization_dir) tensorboard_writer_ = tensorboard_writer.construct() or TensorboardWriter(serialization_dir) if deepspeed_config.optimizer: @@ -672,7 +679,6 @@ def from_partial_objects( model, data_loader, deepspeed_engine=model_engine, - # optimizer=ds_optimizer, patience=patience, validation_metric=validation_metric, validation_data_loader=validation_data_loader, @@ -684,6 +690,8 @@ def from_partial_objects( moving_average=moving_average_, batch_callbacks=batch_callbacks, epoch_callbacks=epoch_callbacks, + end_callbacks=end_callbacks, + trainer_callbacks=trainer_callbacks, distributed=distributed, local_rank=local_rank, world_size=world_size, From a211b5e9e199a1d41b02aa685f02b499a9bd1aa6 Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Thu, 5 Nov 2020 13:06:12 -0500 Subject: [PATCH 09/20] ready for review --- allennlp/training/deepspeed/config.py | 6 ++++++ allennlp/training/deepspeed/sparse_transformer_embedder.py | 1 + allennlp/training/deepspeed/utils.py | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/allennlp/training/deepspeed/config.py b/allennlp/training/deepspeed/config.py index 35a08bb55e1..afecd5a1bf9 100644 --- a/allennlp/training/deepspeed/config.py +++ b/allennlp/training/deepspeed/config.py @@ -23,6 +23,11 @@ class DeepspeedOptimizerConfig(FromParams): type: str params: Dict[str, Any] +@dataclass +class DeepspeedLRSchedulerConfig(FromParams): + type: str + params: Dict[str, Any] + class DeepspeedZeROStage(IntEnum): DISABLED = 0 OPTIMIZER = 1 @@ -46,6 +51,7 @@ class DeepspeedConfig(FromParams): fp16: DeepspeedFP16Config amp: DeepspeedAMPConfig = DeepspeedAMPConfig() optimizer: DeepspeedOptimizerConfig = None + scheduler: DeepspeedLRSchedulerConfig = None zero_allow_untested_optimizer: bool = True wall_clock_breakdown: bool = False diff --git a/allennlp/training/deepspeed/sparse_transformer_embedder.py b/allennlp/training/deepspeed/sparse_transformer_embedder.py index 2207bdfa794..65981a6b4b9 100644 --- a/allennlp/training/deepspeed/sparse_transformer_embedder.py +++ b/allennlp/training/deepspeed/sparse_transformer_embedder.py @@ -3,6 +3,7 @@ from 
deepspeed.ops.sparse_attention.sparse_attention_utils import SparseAttentionUtils +# Doesn't work yet @TokenEmbedder.register('sparse_transformer') class SparseTransformerEmbedder(PretrainedTransformerEmbedder): class __init__(self, **kwargs): diff --git a/allennlp/training/deepspeed/utils.py b/allennlp/training/deepspeed/utils.py index 4be22a58d8b..c1d3f380e8e 100644 --- a/allennlp/training/deepspeed/utils.py +++ b/allennlp/training/deepspeed/utils.py @@ -23,7 +23,7 @@ def launch_deepspeed( if not(optimizer is None or config.optimizer is None): raise ConfigurationError(f"Cannot provide both optimizer and deepspeed_optimizer. {optimizer, config.to_dict()}") - config = dict(**config.to_dict(), train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) + config = dict(**{k: v for k, v in config.to_dict().items() if v is not None}, train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) ds = DeepSpeedEngine( args=args, model=model, From fdd888b080f8bf46726a0e2f2a9f4c17b863cb3c Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Thu, 5 Nov 2020 13:26:51 -0500 Subject: [PATCH 10/20] add new trainer/lazy changes --- allennlp/training/deepspeed/trainer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index 896e8ef87d4..a950656aafb 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -638,10 +638,10 @@ def from_partial_objects( no_grad: List[str] = None, optimizer: Lazy[Optimizer] = None, deepspeed_optimizer: Dict[str, Any] = None, - deepspeed_args: Lazy[DeepspeedArgs] = None, - tensorboard_writer: Lazy[TensorboardWriter] = None, + deepspeed_args: Lazy[DeepspeedArgs] = Lazy(DeepspeedArgs), + tensorboard_writer: Lazy[TensorboardWriter] = Lazy(TensorboardWriter), moving_average: Lazy[MovingAverage] = None, - checkpointer: Lazy[Checkpointer] = None, + checkpointer: Lazy[Checkpointer] = Lazy(DeepspeedCheckpointer), batch_callbacks: List[BatchCallback] = None, epoch_callbacks: List[EpochCallback] = None, end_callbacks: List[EpochCallback] = None, @@ -655,10 +655,12 @@ def from_partial_objects( common_util.log_frozen_and_tunable_parameter_names(model) parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] - moving_average_ = moving_average.construct(parameters=parameters) + moving_average_ = ( + None if moving_average is None else moving_average.construct(parameters=parameters) + ) - checkpointer_ = checkpointer.construct() or DeepspeedCheckpointer(serialization_dir) - tensorboard_writer_ = tensorboard_writer.construct() or TensorboardWriter(serialization_dir) + checkpointer_ = checkpointer.construct(serialization_dir=serialization_dir) + tensorboard_writer_ = tensorboard_writer.construct(serialization_dir=serialization_dir) if deepspeed_config.optimizer: optim_ = None From 083a6d065b7fdd3c8036d7ab161075e678579523 Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Mon, 23 Nov 2020 09:46:54 -0500 Subject: [PATCH 11/20] dangling changes --- allennlp/training/deepspeed/checkpointer.py | 13 +------------ allennlp/training/deepspeed/trainer.py | 4 ---- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/allennlp/training/deepspeed/checkpointer.py b/allennlp/training/deepspeed/checkpointer.py index 2742c56b7de..aea3c63448b 100644 --- a/allennlp/training/deepspeed/checkpointer.py +++ b/allennlp/training/deepspeed/checkpointer.py @@ -19,14 +19,6 @@ class 
DeepspeedCheckpointer(Checkpointer): - # def maybe_save_checkpoint( - # self, - # trainer: _DeepspeedTrainer, - # epoch: int, - # batches_this_epoch: int - # ) -> None: - # 0/0 - def save_checkpoint( self, epoch: Union[int, str], @@ -125,7 +117,4 @@ def restore_checkpoint(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1)) training_state = torch.load(training_state_path, map_location=nn_util.device_mapping(-1)) - return checkpoint_id, model_state, training_state - - def best_model_state(self) -> Dict[str, Any]: - pass \ No newline at end of file + return checkpoint_id, model_state, training_state \ No newline at end of file diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index a950656aafb..406a0480876 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -204,9 +204,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: self._batch_num_total += 1 batch_num_total = self._batch_num_total - # with LineProfiler(self.batch_outputs, target_gpu=self.model_engine.device) as prof: batch_outputs = self.batch_outputs(batch, for_training=True) - # prof.print_stats() # display() loss = batch_outputs.get("loss") reg_loss = batch_outputs.get("reg_loss") @@ -591,8 +589,6 @@ def _restore_checkpoint(self) -> int: self.model.load_state_dict(model_state) self.model_engine.load_checkpoint(self._serialization_dir, checkpoint_id) - # self.optimizer.load_state_dict(training_state["optimizer"]) - # training_util.move_optimizer_to_cuda(self.optimizer) # Currently the `training_state` contains a serialized `MetricTracker`. if "metric_tracker" in training_state: From 4e4f7d748676a52b87441e8b8cd71614dc040555 Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Mon, 30 Nov 2020 09:56:53 -0500 Subject: [PATCH 12/20] updating from master --- allennlp/training/__init__.py | 2 +- allennlp/training/deepspeed/__init__.py | 15 +- allennlp/training/deepspeed/checkpointer.py | 35 +- allennlp/training/deepspeed/config.py | 19 +- allennlp/training/deepspeed/optimizers.py | 39 +-- .../deepspeed/sparse_transformer_embedder.py | 11 - allennlp/training/deepspeed/trainer.py | 330 ++++++------------ allennlp/training/deepspeed/utils.py | 39 --- 8 files changed, 156 insertions(+), 334 deletions(-) delete mode 100644 allennlp/training/deepspeed/sparse_transformer_embedder.py delete mode 100644 allennlp/training/deepspeed/utils.py diff --git a/allennlp/training/__init__.py b/allennlp/training/__init__.py index 23e042c372f..662eab06197 100644 --- a/allennlp/training/__init__.py +++ b/allennlp/training/__init__.py @@ -15,4 +15,4 @@ # try: # from allennlp.training.deepspeed import DeepspeedTrainer # except ImportError: -# warnings.warn('Deepspeed plugin not installed. Ignoring.') \ No newline at end of file +# warnings.warn('Deepspeed plugin not installed. 
Ignoring.') diff --git a/allennlp/training/deepspeed/__init__.py b/allennlp/training/deepspeed/__init__.py index 3a3637f660d..5c709ed7356 100644 --- a/allennlp/training/deepspeed/__init__.py +++ b/allennlp/training/deepspeed/__init__.py @@ -1,11 +1,8 @@ from allennlp.training.deepspeed.trainer import DeepspeedTrainer -from allennlp.training.deepspeed.optimizers import ( - FusedAdamOptimizer, - DeepspeedCPUAdamOptimizer, - FusedLambOptimizer -) -try: - from allennlp.training.deepspeed.sparse_transformer_embedder import SparseTransformerEmbedder -except ImportError: - pass \ No newline at end of file +# from allennlp.training.deepspeed.optimizers import ( +# FusedAdamOptimizer, +# DeepspeedCPUAdamOptimizer, +# FusedLambOptimizer +# ) +from allennlp.training.deepspeed import optimizers diff --git a/allennlp/training/deepspeed/checkpointer.py b/allennlp/training/deepspeed/checkpointer.py index aea3c63448b..982f174ccd9 100644 --- a/allennlp/training/deepspeed/checkpointer.py +++ b/allennlp/training/deepspeed/checkpointer.py @@ -1,10 +1,10 @@ -from typing import Union, Dict, Any, List, Tuple, Optional +from typing import Union, Dict, Any, Tuple, Optional, Iterable import logging import os -import re import shutil import time +import overrides from pathlib import Path @@ -12,35 +12,35 @@ import allennlp from allennlp.nn import util as nn_util -from allennlp.training import util as training_util, Checkpointer +from allennlp.training import Checkpointer logger = logging.getLogger(__name__) -_DeepspeedTrainer = "allennlp.training.deepspeed.trainer.DeepspeedTrainer" class DeepspeedCheckpointer(Checkpointer): + @overrides def save_checkpoint( self, epoch: Union[int, str], - trainer: _DeepspeedTrainer, + trainer: "allennlp.training.deepspeed.trainer.DeepspeedTrainer", is_best_so_far: bool = False, - save_model_only=False, + save_model_only: bool = False, ) -> None: if self._serialization_dir is None: return with trainer.get_checkpoint_state() as state: model_engine, model_state, training_states = state - + checkpoint_id = "deepspeed_epoch_{}".format(epoch) model_path = os.path.join(self._serialization_dir, "model_state_epoch_{}".format(epoch)) model_engine.save_checkpoint(self._serialization_dir, checkpoint_id) # TODO - # Model will need a weight file to load; + # Model will need a weight file to load; # not sure if ZeRO stage 2 will mess this up if not os.path.isfile(model_path): - torch.save(model_state, model_path) + torch.save(model_state, model_path) if save_model_only: return @@ -58,9 +58,9 @@ def save_checkpoint( self._serialization_dir, ) shutil.copyfile(model_path, os.path.join(self._serialization_dir, "best.th")) - + engine_dir = os.path.join(self._serialization_dir, "best_deepspeed") - shutil.rmtree(engine_dir, ignore_errors=True) # in case no previous checkpoints + shutil.rmtree(engine_dir, ignore_errors=True) # in case no previous checkpoints shutil.copytree(os.path.join(self._serialization_dir, checkpoint_id), engine_dir) if ( @@ -79,10 +79,7 @@ def save_checkpoint( time_since_checkpoint_kept = ( save_time - self._last_permanent_saved_checkpoint_time ) - if ( - time_since_checkpoint_kept - > self._keep_serialized_model_every_num_seconds - ): + if time_since_checkpoint_kept > self._keep_serialized_model_every_num_seconds: # We want to keep this checkpoint. 
remove_path = False self._last_permanent_saved_checkpoint_time = save_time @@ -91,6 +88,7 @@ def save_checkpoint( if os.path.isfile(fname): os.remove(fname) + @overrides def find_latest_checkpoint(self) -> Optional[Tuple[str, str]]: latest = super().find_latest_checkpoint() if not latest: @@ -98,7 +96,9 @@ def find_latest_checkpoint(self) -> Optional[Tuple[str, str]]: model_path, training_state_path = latest - checkpoints = (self._serialization_dir and Path(self._serialization_dir).glob('deepspeed_epoch_*')) or [] + checkpoints: Iterable[Path] = ( + self._serialization_dir and Path(self._serialization_dir).glob("deepspeed_epoch_*") + ) or [] checkpoints = sorted(c for c in checkpoints if c.is_dir()) if not checkpoints: return None @@ -106,6 +106,7 @@ def find_latest_checkpoint(self) -> Optional[Tuple[str, str]]: engine_path = checkpoints[-1] return engine_path, model_path, training_state_path + @overrides def restore_checkpoint(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: latest_checkpoint = self.find_latest_checkpoint() @@ -117,4 +118,4 @@ def restore_checkpoint(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1)) training_state = torch.load(training_state_path, map_location=nn_util.device_mapping(-1)) - return checkpoint_id, model_state, training_state \ No newline at end of file + return checkpoint_id, model_state, training_state diff --git a/allennlp/training/deepspeed/config.py b/allennlp/training/deepspeed/config.py index afecd5a1bf9..f067febc473 100644 --- a/allennlp/training/deepspeed/config.py +++ b/allennlp/training/deepspeed/config.py @@ -1,4 +1,4 @@ -from typing import Dict, Any +from typing import Dict, Any, Optional from enum import IntEnum from allennlp.common import FromParams from dataclasses import dataclass, asdict @@ -7,32 +7,37 @@ @dataclass class DeepspeedFP16Config(FromParams): enabled: bool = True - loss_scale: float = 0. + loss_scale: float = 0.0 initial_scale_power: int = 32 loss_scale_window: int = 1000 hysteresis: int = 2 - min_loss_scale: float = 1. 
+ min_loss_scale: float = 1.0 + @dataclass class DeepspeedAMPConfig(FromParams): enabled: bool = False opt_level: str = "O1" + @dataclass class DeepspeedOptimizerConfig(FromParams): type: str params: Dict[str, Any] + @dataclass class DeepspeedLRSchedulerConfig(FromParams): type: str params: Dict[str, Any] + class DeepspeedZeROStage(IntEnum): DISABLED = 0 OPTIMIZER = 1 GRADIENT = 2 + @dataclass class DeepspeedZeROConfig(FromParams): stage: DeepspeedZeROStage = DeepspeedZeROStage.GRADIENT @@ -50,9 +55,9 @@ class DeepspeedConfig(FromParams): zero_optimization: DeepspeedZeROConfig fp16: DeepspeedFP16Config amp: DeepspeedAMPConfig = DeepspeedAMPConfig() - optimizer: DeepspeedOptimizerConfig = None - scheduler: DeepspeedLRSchedulerConfig = None - + optimizer: Optional[DeepspeedOptimizerConfig] = None + scheduler: Optional[DeepspeedLRSchedulerConfig] = None + zero_allow_untested_optimizer: bool = True wall_clock_breakdown: bool = False @@ -65,4 +70,4 @@ class DeepspeedArgs(FromParams): local_rank: int deepspeed: bool = True deepspeed_mpi: bool = False - deepspeed_config: str = None \ No newline at end of file + deepspeed_config: Optional[str] = None diff --git a/allennlp/training/deepspeed/optimizers.py b/allennlp/training/deepspeed/optimizers.py index 7eaa229a480..226adcb67ce 100644 --- a/allennlp/training/deepspeed/optimizers.py +++ b/allennlp/training/deepspeed/optimizers.py @@ -2,39 +2,13 @@ import torch -from apex.optimizers.fused_adam import FusedAdam from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.ops.lamb import FusedLamb -from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + +# from deepspeed.runtime.fp16.onebit_adam import OnebitAdam from allennlp.training.optimizers import Optimizer, make_parameter_groups -@Optimizer.register("fused_adam") -class FusedAdamOptimizer(Optimizer, FusedAdam): - def __init__( - self, - model_parameters: List[Tuple[str, torch.nn.Parameter]], - parameter_groups: List[Tuple[List[str], Dict[str, Any]]] = None, - lr: float = 0.001, - betas: Tuple[float, float] = (0.9, 0.999), - eps: float = 1e-08, - weight_decay: float = 0.0, - amsgrad: bool = False, - bias_correction: bool =True, - adam_w_mode: bool = True, - set_grad_none: bool = True, - ): - super().__init__( - params=make_parameter_groups(model_parameters, parameter_groups), - lr=lr, - betas=betas, - eps=eps, - weight_decay=weight_decay, - amsgrad=amsgrad, - bias_correction=bias_correction, - adam_w_mode=adam_w_mode, - set_grad_none=set_grad_none, - ) # This does not currently work @Optimizer.register("cpu_adam") @@ -55,9 +29,10 @@ def __init__( betas=betas, eps=eps, weight_decay=weight_decay, - amsgrad=amsgrad + amsgrad=amsgrad, ) + @Optimizer.register("fused_lamb") class FusedLambOptimizer(Optimizer, FusedLamb): def __init__( @@ -70,9 +45,9 @@ def __init__( eps_inside_sqrt: bool = False, weight_decay: float = 0.0, amsgrad: bool = False, - max_grad_norm: float = 0., + max_grad_norm: float = 0.0, max_coeff: float = 10.0, - min_coeff: float = 0.01 + min_coeff: float = 0.01, ): super().__init__( params=make_parameter_groups(model_parameters, parameter_groups), @@ -84,4 +59,4 @@ def __init__( max_grad_norm=max_grad_norm, max_coeff=max_coeff, min_coeff=min_coeff, - ) \ No newline at end of file + ) diff --git a/allennlp/training/deepspeed/sparse_transformer_embedder.py b/allennlp/training/deepspeed/sparse_transformer_embedder.py deleted file mode 100644 index 65981a6b4b9..00000000000 --- a/allennlp/training/deepspeed/sparse_transformer_embedder.py +++ /dev/null @@ -1,11 +0,0 @@ -from 
allennlp.modules.token_embedders.token_embedder import TokenEmbedder -from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder - -from deepspeed.ops.sparse_attention.sparse_attention_utils import SparseAttentionUtils - -# Doesn't work yet -@TokenEmbedder.register('sparse_transformer') -class SparseTransformerEmbedder(PretrainedTransformerEmbedder): - class __init__(self, **kwargs): - super().__init__(**kwargs) - self.transformer_model = SparseAttentionUtils.replace_model_self_attention_with_sparse_self_attention(self.transformer_model) \ No newline at end of file diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index 406a0480876..d6476e3c834 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -1,6 +1,5 @@ import datetime import logging -import math import os import re import time @@ -8,43 +7,48 @@ from contextlib import contextmanager from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from allennlp.common.util import int_to_device - import torch import torch.distributed as dist -from torch.cuda import amp -from torch.nn.parallel import DistributedDataParallel -from torch.nn.utils import clip_grad_norm_ -from allennlp.common import Lazy, Registrable, Tqdm +from deepspeed.runtime.engine import DeepSpeedEngine +from deepspeed.utils import logger as ds_logger + +from allennlp.common import Lazy, Tqdm from allennlp.common import util as common_util -from allennlp.common.checks import ConfigurationError, check_for_gpu +from allennlp.common.checks import ConfigurationError from allennlp.data import DataLoader from allennlp.data.dataloader import TensorDict from allennlp.models.model import Model from allennlp.nn import util as nn_util from allennlp.training import util as training_util from allennlp.training.checkpointer import Checkpointer -from allennlp.training.metric_tracker import MetricTracker from allennlp.training.moving_average import MovingAverage from allennlp.training.optimizers import Optimizer from allennlp.training.tensorboard_writer import TensorboardWriter -from allennlp.training.trainer import Trainer, BatchCallback, EpochCallback, TrainerCallback +from allennlp.training.trainer import ( + Trainer, + GradientDescentTrainer, + BatchCallback, + EpochCallback, + TrainerCallback, +) from allennlp.training.deepspeed.config import DeepspeedConfig, DeepspeedArgs from allennlp.training.deepspeed.checkpointer import DeepspeedCheckpointer -from allennlp.training.deepspeed.utils import launch_deepspeed logger = logging.getLogger(__name__) +ds_logger.setLevel(logging.WARNING) +ds_logger.propagate = False + @Trainer.register("deepspeed", constructor="from_partial_objects") -class DeepspeedTrainer(Trainer): +class DeepspeedTrainer(GradientDescentTrainer): def __init__( self, model: Model, data_loader: DataLoader, - deepspeed_engine: 'DeepSpeedEngine', + deepspeed_engine: DeepSpeedEngine, patience: Optional[int] = None, validation_metric: str = "-loss", validation_data_loader: DataLoader = None, @@ -61,66 +65,38 @@ def __init__( distributed: bool = False, local_rank: int = 0, world_size: int = 1, - num_gradient_accumulation_steps: int = 1 + num_gradient_accumulation_steps: int = 1, ) -> None: - super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size) - self.model = model + super().__init__( + model=model, + optimizer=deepspeed_engine.optimizer, + data_loader=data_loader, + patience=patience, + 
validation_metric=validation_metric, + validation_data_loader=validation_data_loader, + num_epochs=num_epochs, + serialization_dir=serialization_dir, + cuda_device=cuda_device, + tensorboard_writer=tensorboard_writer, + checkpointer=checkpointer, + moving_average=moving_average, + batch_callbacks=batch_callbacks, + epoch_callbacks=epoch_callbacks, + end_callbacks=end_callbacks, + trainer_callbacks=trainer_callbacks, + distributed=False, + local_rank=local_rank, + world_size=world_size, + num_gradient_accumulation_steps=num_gradient_accumulation_steps, + use_amp=False, + ) self.model_engine = deepspeed_engine - self.optimizer = deepspeed_engine.optimizer - - self.data_loader = data_loader - self._validation_data_loader = validation_data_loader - - if patience is None: # no early stopping - if validation_data_loader is not None: - logger.warning( - "You provided a validation dataset but patience was set to None, " - "meaning that early stopping is disabled" - ) - elif (not isinstance(patience, int)) or patience <= 0: - raise ConfigurationError( - '{} is an invalid value for "patience": it must be a positive integer ' - "or None (if you want to disable early stopping)".format(patience) - ) - - # For tracking is_best_so_far and should_stop_early - self._metric_tracker = MetricTracker(patience, validation_metric) - # Get rid of + or - - self._validation_metric = validation_metric[1:] - - self._num_epochs = num_epochs + self._distributed = True - if checkpointer is not None: - self._checkpointer = checkpointer - else: + if checkpointer is None and serialization_dir is not None: self._checkpointer = DeepspeedCheckpointer(serialization_dir) - self._moving_average = moving_average - self._batch_callbacks = batch_callbacks or [] - self._epoch_callbacks = epoch_callbacks or [] - self._end_callbacks = end_callbacks or [] - - for callback in trainer_callbacks or []: - self._batch_callbacks.append(callback.batch()) - self._epoch_callbacks.append(callback.epoch()) - self._end_callbacks.append(callback.end()) - - # We keep the total batch number as an instance variable because it - # is used inside a closure for the hook which logs activations in - # `_enable_activation_logging`. - self._batch_num_total = 0 - - self._tensorboard = tensorboard_writer or TensorboardWriter(serialization_dir) - self._tensorboard.get_batch_num_total = lambda: self._batch_num_total - self._tensorboard.enable_activation_logging(self.model) - - self._last_log = 0.0 # time of last logging - - self._num_gradient_accumulation_steps = num_gradient_accumulation_steps - - self._pytorch_model = self.model - def batch_outputs(self, batch: TensorDict, for_training: bool) -> Dict[str, torch.Tensor]: """ Does a forward pass on the given batch and returns the output dictionary that the model @@ -165,13 +141,9 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: train_loss = 0.0 batch_loss = 0.0 + train_reg_loss = None if regularization_penalty is None else 0.0 + batch_reg_loss = None if regularization_penalty is None else 0.0 - if regularization_penalty is not None: - train_reg_loss = 0.0 - batch_reg_loss = 0.0 - else: - train_reg_loss = None - batch_reg_loss = None # Set the model to "train" mode. self.model_engine.train() @@ -183,14 +155,13 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: num_training_batches: Union[int, float] len_data_loader = len(self.data_loader) num_training_batches = len_data_loader - + # Having multiple tqdm bars in case of distributed training will be a mess. 
Hence only the master's # progress is shown - batch_generator_tqdm = batch_generator if self._master: - batch_generator_tqdm = Tqdm.tqdm( - batch_generator, total=num_training_batches - ) + batch_generator_tqdm = Tqdm.tqdm(batch_generator, total=num_training_batches) + else: + batch_generator_tqdm = batch_generator self._last_log = time.time() @@ -198,7 +169,6 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: if self._batch_num_total is None: self._batch_num_total = 0 - done_early = False for batch in batch_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 @@ -211,13 +181,12 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: if torch.isnan(loss): raise ValueError("nan loss encountered") - batch_loss = loss.item() + batch_loss = 0 if loss is None else loss.item() train_loss += batch_loss if reg_loss is not None: batch_reg_loss = reg_loss.item() - train_reg_loss += batch_reg_loss + train_reg_loss += batch_reg_loss # type: ignore - self.model_engine.backward(loss) self.model_engine.step() @@ -258,19 +227,21 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: self._tensorboard.log_batch( self.model, self.optimizer, - 0., # batch_grad_norm, + 0.0, metrics, batch, param_updates, ) - self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) - + if self._checkpointer is not None: + self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) + for callback in self._batch_callbacks: callback( self, batch, - batch_outputs, + [batch_outputs], + metrics, epoch, batches_this_epoch, is_training=True, @@ -295,123 +266,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024) return metrics - def _validation_loss(self, epoch: int) -> Tuple[float, float, int]: - """ - Computes the validation loss. Returns it and the number of batches. - """ - logger.info("Validating") - - self.model_engine.eval() - - # Replace parameter values with the shadow values from the moving averages. - if self._moving_average is not None: - self._moving_average.assign_average_value() - - if self._validation_data_loader is not None: - validation_data_loader = self._validation_data_loader - else: - raise ConfigurationError( - "Validation results cannot be calculated without a validation_data_loader" - ) - - regularization_penalty = self.model.get_regularization_penalty() - - # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's - # progress is shown - if self._master: - val_generator_tqdm = Tqdm.tqdm(validation_data_loader) - else: - val_generator_tqdm = validation_data_loader - - batches_this_epoch = 0 - val_loss = 0 - val_batch_loss = 0 - if regularization_penalty is not None: - val_reg_loss = 0 - val_batch_reg_loss = 0 - else: - val_reg_loss = None - val_batch_reg_loss = None - done_early = False - for batch in val_generator_tqdm: - if self._distributed: - # Check whether the other workers have stopped already (due to differing amounts of - # data in each). If so, we can't proceed because we would hang when we hit the - # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor - # here because NCCL process groups apparently don't support BoolTensor. - done = torch.tensor(0, device=self.cuda_device) - torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) - if done.item() > 0: - done_early = True - logger.warning( - f"Worker {torch.distributed.get_rank()} finishing validation early! 
" - "This implies that there is an imbalance in your validation " - "data across the workers and that some amount of it will be " - "ignored. A small amount of this is fine, but a major imbalance " - "should be avoided. Note: This warning will appear unless your " - "data is perfectly balanced." - ) - break - - batch_outputs = self.batch_outputs(batch, for_training=False) - loss = batch_outputs.get("loss") - reg_loss = batch_outputs.get("reg_loss") - if loss is not None: - # You shouldn't necessarily have to compute a loss for validation, so we allow for - # `loss` to be None. We need to be careful, though - `batches_this_epoch` is - # currently only used as the divisor for the loss function, so we can safely only - # count those batches for which we actually have a loss. If this variable ever - # gets used for something else, we might need to change things around a bit. - batches_this_epoch += 1 - val_batch_loss = loss.detach().cpu().numpy() - val_loss += val_batch_loss - if reg_loss is not None: - val_batch_reg_loss = reg_loss.detach().cpu().numpy() - val_reg_loss += val_batch_reg_loss - - # Update the description with the latest metrics - val_metrics = training_util.get_metrics( - self.model, - val_loss, - val_reg_loss, - val_batch_loss, - val_batch_reg_loss, - batches_this_epoch, - world_size=self._world_size, - cuda_device=self.cuda_device, - ) - - description = training_util.description_from_metrics(val_metrics) - if self._master: - val_generator_tqdm.set_description(description, refresh=False) - - for callback in self._batch_callbacks: - callback( - self, - [batch], - [batch_outputs], - epoch, - batches_this_epoch, - is_training=False, - is_master=self._master, - ) - - if self._distributed and not done_early: - logger.warning( - f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)." - ) - # Indicate that we're done so that any workers that have remaining data stop validation early. - done = torch.tensor(1, device=self.cuda_device) - torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) - assert done.item() - - # Now restore the original parameter values. - if self._moving_average is not None: - self._moving_average.restore() - - return val_loss, val_reg_loss, batches_this_epoch - - def train(self) -> Dict[str, Any]: + def _try_train(self) -> Dict[str, Any]: """ Trains the supplied model with the supplied parameters. 
""" @@ -428,7 +283,7 @@ def train(self) -> Dict[str, Any]: logger.info("Beginning training.") val_metrics: Dict[str, float] = {} - this_epoch_val_metric: float = None + this_epoch_val_metric: float = 0.0 metrics: Dict[str, Any] = {} epochs_trained = 0 training_start_time = time.time() @@ -512,11 +367,11 @@ def train(self) -> Dict[str, Any]: os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), metrics ) - # deepspeed checkpointing handles master / dist.barrier calls - self._checkpointer.save_checkpoint( - epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() - ) + if self._checkpointer is not None: + self._checkpointer.save_checkpoint( + epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() + ) for callback in self._epoch_callbacks: callback(self, metrics=metrics, epoch=epoch, is_master=self._master) @@ -538,7 +393,9 @@ def train(self) -> Dict[str, Any]: self._tensorboard.close() # Load the best model state before returning - best_model_state = self._checkpointer.best_model_state() + best_model_state = ( + None if self._checkpointer is None else self._checkpointer.best_model_state() + ) if best_model_state: self.model.load_state_dict(best_model_state) @@ -581,6 +438,9 @@ def _restore_checkpoint(self) -> int: The epoch at which to resume training, which should be one after the epoch in the saved training state. """ + if self._checkpointer is None: + return 0 + checkpoint_id, model_state, training_state = self._checkpointer.restore_checkpoint() if not training_state: @@ -621,7 +481,6 @@ def from_partial_objects( serialization_dir: str, data_loader: DataLoader, deepspeed_config: DeepspeedConfig, - validation_data_loader: DataLoader = None, local_rank: int = 0, patience: int = None, @@ -632,7 +491,7 @@ def from_partial_objects( world_size: int = 1, num_gradient_accumulation_steps: int = 1, no_grad: List[str] = None, - optimizer: Lazy[Optimizer] = None, + optimizer: Lazy[Optimizer] = Lazy(Optimizer.default), deepspeed_optimizer: Dict[str, Any] = None, deepspeed_args: Lazy[DeepspeedArgs] = Lazy(DeepspeedArgs), tensorboard_writer: Lazy[TensorboardWriter] = Lazy(TensorboardWriter), @@ -642,7 +501,7 @@ def from_partial_objects( epoch_callbacks: List[EpochCallback] = None, end_callbacks: List[EpochCallback] = None, trainer_callbacks: List[TrainerCallback] = None, - ) -> "Trainer": + ) -> "GradientDescentTrainer": if no_grad: for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad): @@ -662,15 +521,17 @@ def from_partial_objects( optim_ = None else: optim_ = optimizer.construct(model_parameters=parameters) - - deepspeed_args = deepspeed_args.construct(local_rank=local_rank) or DeepspeedArgs(local_rank=local_rank) - model_engine, ds_optimizer = launch_deepspeed( + + deepspeed_args_ = deepspeed_args.construct(local_rank=local_rank) or DeepspeedArgs( + local_rank=local_rank + ) + model_engine, ds_optimizer = _launch_deepspeed( model, optim_, deepspeed_config, - deepspeed_args, + deepspeed_args_, data_loader.batch_size, - num_gradient_accumulation_steps + num_gradient_accumulation_steps, ) return cls( @@ -690,10 +551,43 @@ def from_partial_objects( epoch_callbacks=epoch_callbacks, end_callbacks=end_callbacks, trainer_callbacks=trainer_callbacks, - distributed=distributed, + distributed=False, local_rank=local_rank, world_size=world_size, num_gradient_accumulation_steps=num_gradient_accumulation_steps, ) +def _launch_deepspeed( + model: Model, + optimizer: torch.optim.Optimizer, + deepspeed_config: DeepspeedConfig, + args: 
DeepspeedArgs, + batch_size: int, + gradient_accumulation_steps: int, +): + if not (optimizer is None or deepspeed_config.optimizer is None): + raise ConfigurationError( + f"Cannot provide both optimizer and deepspeed_optimizer. {optimizer, deepspeed_config.to_dict()}" + ) + + config: Dict[str, Any] = dict( + **{k: v for k, v in deepspeed_config.to_dict().items() if v is not None}, + train_batch_size=batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + ) + ds = DeepSpeedEngine( + args=args, + model=model, + optimizer=optimizer, + model_parameters=model.parameters(), + dist_init_required=False, + config_params=config, + ) + if hasattr(ds, "timers"): + + def mute_log(*args, **kwargs): + pass + + ds.timers.log = mute_log + return ds, ds.optimizer diff --git a/allennlp/training/deepspeed/utils.py b/allennlp/training/deepspeed/utils.py deleted file mode 100644 index c1d3f380e8e..00000000000 --- a/allennlp/training/deepspeed/utils.py +++ /dev/null @@ -1,39 +0,0 @@ -import logging -from deepspeed.utils import logger as ds_logger -ds_logger.setLevel(logging.WARNING) -ds_logger.propagate = False - -import torch -from allennlp.models.model import Model -from allennlp.common import Lazy -from allennlp.common.checks import ConfigurationError -from allennlp.training.deepspeed.config import DeepspeedConfig, DeepspeedArgs - -import deepspeed -from deepspeed.runtime.engine import DeepSpeedEngine - -def launch_deepspeed( - model: Model, - optimizer: torch.optim.Optimizer, - config: DeepspeedConfig, - args: Lazy[DeepspeedArgs], - batch_size: int, - gradient_accumulation_steps: int, -): - if not(optimizer is None or config.optimizer is None): - raise ConfigurationError(f"Cannot provide both optimizer and deepspeed_optimizer. {optimizer, config.to_dict()}") - - config = dict(**{k: v for k, v in config.to_dict().items() if v is not None}, train_batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps) - ds = DeepSpeedEngine( - args=args, - model=model, - optimizer=optimizer, - model_parameters=model.parameters(), - dist_init_required=False, - config_params=config - ) - if hasattr(ds, 'timers'): - def mute_log(*args, **kwargs): - pass - ds.timers.log = mute_log - return ds, ds.optimizer \ No newline at end of file From f48ea19bf73da87c6a6e89874add7d1000f672cf Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Mon, 30 Nov 2020 10:48:38 -0500 Subject: [PATCH 13/20] typechecks passing! 
--- allennlp/training/deepspeed/checkpointer.py | 13 ++++---- allennlp/training/deepspeed/trainer.py | 35 ++++++--------------- 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/allennlp/training/deepspeed/checkpointer.py b/allennlp/training/deepspeed/checkpointer.py index 982f174ccd9..56aaf988033 100644 --- a/allennlp/training/deepspeed/checkpointer.py +++ b/allennlp/training/deepspeed/checkpointer.py @@ -4,7 +4,7 @@ import os import shutil import time -import overrides +from overrides import overrides from pathlib import Path @@ -30,7 +30,8 @@ def save_checkpoint( return with trainer.get_checkpoint_state() as state: - model_engine, model_state, training_states = state + model_state, training_states = state + model_engine = trainer.model_engine checkpoint_id = "deepspeed_epoch_{}".format(epoch) model_path = os.path.join(self._serialization_dir, "model_state_epoch_{}".format(epoch)) @@ -89,7 +90,7 @@ def save_checkpoint( os.remove(fname) @overrides - def find_latest_checkpoint(self) -> Optional[Tuple[str, str]]: + def find_latest_checkpoint(self) -> Optional[Tuple[str, str, str]]: latest = super().find_latest_checkpoint() if not latest: return None @@ -104,15 +105,15 @@ def find_latest_checkpoint(self) -> Optional[Tuple[str, str]]: return None engine_path = checkpoints[-1] - return engine_path, model_path, training_state_path + return str(engine_path), model_path, training_state_path @overrides - def restore_checkpoint(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def restore_checkpoint(self) -> Tuple[int, Dict[str, Any], Dict[str, Any]]: latest_checkpoint = self.find_latest_checkpoint() if latest_checkpoint is None: # No checkpoint to restore, start at 0 - return {}, {}, {} + return -1, {}, {} checkpoint_id, model_path, training_state_path = latest_checkpoint diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index d6476e3c834..1f66c747e64 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -6,6 +6,7 @@ import traceback from contextlib import contextmanager from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from overrides import overrides import torch import torch.distributed as dist @@ -185,7 +186,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: train_loss += batch_loss if reg_loss is not None: batch_reg_loss = reg_loss.item() - train_reg_loss += batch_reg_loss # type: ignore + train_reg_loss += batch_reg_loss # type: ignore self.model_engine.backward(loss) self.model_engine.step() @@ -401,28 +402,6 @@ def _try_train(self) -> Dict[str, Any]: return metrics - @contextmanager - def get_checkpoint_state(self) -> Iterator[Tuple[Dict[str, Any], Dict[str, Any]]]: - if self._moving_average is not None: - # Assigning average value to model parameters. The checkpointer will call - # `restore_state_after_checkpointing` when it is done to put this back to what it was. - self._moving_average.assign_average_value() - - model_state = self.model.state_dict() - - # These are the training states we need to persist. - training_states = { - "metric_tracker": self._metric_tracker.state_dict(), - "optimizer": self.optimizer.state_dict(), - "batch_num_total": self._batch_num_total, - } - - try: - yield self.model_engine, model_state, training_states - finally: - if self._moving_average is not None: - self._moving_average.restore() - def _restore_checkpoint(self) -> int: """ Restores the model and training state from the last saved checkpoint. 
@@ -441,6 +420,7 @@ def _restore_checkpoint(self) -> int: if self._checkpointer is None: return 0 + self._checkpointer: DeepspeedCheckpointer checkpoint_id, model_state, training_state = self._checkpointer.restore_checkpoint() if not training_state: @@ -475,6 +455,7 @@ def _restore_checkpoint(self) -> int: return epoch_to_return @classmethod + @overrides def from_partial_objects( cls, model: Model, @@ -501,7 +482,7 @@ def from_partial_objects( epoch_callbacks: List[EpochCallback] = None, end_callbacks: List[EpochCallback] = None, trainer_callbacks: List[TrainerCallback] = None, - ) -> "GradientDescentTrainer": + ) -> "DeepspeedTrainer": if no_grad: for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad): @@ -525,12 +506,16 @@ def from_partial_objects( deepspeed_args_ = deepspeed_args.construct(local_rank=local_rank) or DeepspeedArgs( local_rank=local_rank ) + + if not hasattr(data_loader, 'batch_size'): + raise ConfigurationError("Please specify your batch size in Deepspeed config if not using AllennlpDataLoader.") + model_engine, ds_optimizer = _launch_deepspeed( model, optim_, deepspeed_config, deepspeed_args_, - data_loader.batch_size, + data_loader.batch_size, # type: ignore num_gradient_accumulation_steps, ) From b3328fc03ec1fbd5907d580013e6404b5abb3804 Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Sun, 3 Jan 2021 11:25:04 -0500 Subject: [PATCH 14/20] init file --- allennlp/training/__init__.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/allennlp/training/__init__.py b/allennlp/training/__init__.py index 662eab06197..27b2b5f6d91 100644 --- a/allennlp/training/__init__.py +++ b/allennlp/training/__init__.py @@ -9,10 +9,4 @@ TrainerCallback, TrackEpochCallback, ) -from allennlp.training.deepspeed import DeepspeedTrainer - -# import warnings -# try: -# from allennlp.training.deepspeed import DeepspeedTrainer -# except ImportError: -# warnings.warn('Deepspeed plugin not installed. Ignoring.') +from allennlp.training.deepspeed import DeepspeedTrainer # TODO: make this optional \ No newline at end of file From 2fdb7c0382a15de940fdd7ddeb7306dc002879f4 Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Fri, 8 Jan 2021 11:16:45 -0500 Subject: [PATCH 15/20] save old tests in case --- allennlp/training/deepspeed/trainer.py | 107 ++++++++++++++++--------- tests/commands/train_test.py | 55 +++++++++++++ tests/training/trainer_test.py | 107 +++++++++++++++++++++++++ 3 files changed, 229 insertions(+), 40 deletions(-) diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index 1f66c747e64..34503559181 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -95,6 +95,7 @@ def __init__( self.model_engine = deepspeed_engine self._distributed = True + serialization_dir = None if checkpointer is None and serialization_dir is not None: self._checkpointer = DeepspeedCheckpointer(serialization_dir) @@ -129,6 +130,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: Trains one epoch and returns metrics. 
""" logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) + logger.info(f"logging mem usg") cpu_memory_usage = [] for worker, memory in common_util.peak_cpu_memory().items(): cpu_memory_usage.append((worker, memory)) @@ -137,6 +139,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: for gpu, memory in common_util.peak_gpu_memory().items(): gpu_memory_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") + logger.info(f"done logging mem usg") regularization_penalty = self.model.get_regularization_penalty() @@ -268,9 +271,6 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: return metrics def _try_train(self) -> Dict[str, Any]: - """ - Trains the supplied model with the supplied parameters. - """ try: epoch_counter = self._restore_checkpoint() except RuntimeError: @@ -281,6 +281,8 @@ def _try_train(self) -> Dict[str, Any]: "directory?" ) + training_util.enable_gradient_clipping(self.model, self._grad_clipping) + logger.info("Beginning training.") val_metrics: Dict[str, float] = {} @@ -298,8 +300,18 @@ def _try_train(self) -> Dict[str, Any]: for epoch in range(epoch_counter, self._num_epochs): epoch_start_time = time.time() + logger.info("Training epoch.") train_metrics = self._train_epoch(epoch) + # if self._master and self._checkpointer is not None: + # self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) + + # # Wait for the master to finish saving the model checkpoint + # if self._distributed: + # dist.barrier() + + logger.info("Passed start of epoch checkpoint barrier.") + # get peak of memory usage for key, value in train_metrics.items(): if key.startswith("gpu_") and key.endswith("_memory_MB"): @@ -317,6 +329,8 @@ def _try_train(self) -> Dict[str, Any]: if self._distributed: dist.barrier() + logger.info("Passed validation metrics barrier.") + val_metrics = training_util.get_metrics( self.model, val_loss, @@ -365,15 +379,29 @@ def _try_train(self) -> Dict[str, Any]: if self._serialization_dir and self._master: common_util.dump_metrics( - os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), metrics + os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), + metrics, ) - # deepspeed checkpointing handles master / dist.barrier calls - if self._checkpointer is not None: + # The Scheduler API is agnostic to whether your schedule requires a validation metric - + # if it doesn't, the validation metric passed here is ignored. 
+ if self._learning_rate_scheduler: + self._learning_rate_scheduler.step(this_epoch_val_metric) + if self._momentum_scheduler: + self._momentum_scheduler.step(this_epoch_val_metric) + + if self._master and self._checkpointer is not None: self._checkpointer.save_checkpoint( epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() ) + logger.info("Starting end of epoch checkpoint barrier...") + # Wait for the master to finish saving the checkpoint + # if self._distributed: + # dist.barrier() + + logger.info("Passed end of epoch checkpoint barrier.") + for callback in self._epoch_callbacks: callback(self, metrics=metrics, epoch=epoch, is_master=self._master) @@ -390,8 +418,8 @@ def _try_train(self) -> Dict[str, Any]: epochs_trained += 1 - # make sure pending events are flushed to disk and files are closed properly - self._tensorboard.close() + for callback in self._end_callbacks: + callback(self, metrics=metrics, epoch=epoch, is_master=self._master) # Load the best model state before returning best_model_state = ( @@ -510,7 +538,7 @@ def from_partial_objects( if not hasattr(data_loader, 'batch_size'): raise ConfigurationError("Please specify your batch size in Deepspeed config if not using AllennlpDataLoader.") - model_engine, ds_optimizer = _launch_deepspeed( + model_engine = DeepspeedTrainer._build_engine( model, optim_, deepspeed_config, @@ -542,37 +570,36 @@ def from_partial_objects( num_gradient_accumulation_steps=num_gradient_accumulation_steps, ) + @staticmethod + def _build_engine( + model: Model, + optimizer: torch.optim.Optimizer, + deepspeed_config: DeepspeedConfig, + args: DeepspeedArgs, + batch_size: int, + num_gradient_accumulation_steps: int, + ): + if not (optimizer is None or deepspeed_config.optimizer is None): + raise ConfigurationError( + f"Cannot provide both optimizer and deepspeed_optimizer. {optimizer, deepspeed_config.to_dict()}" + ) -def _launch_deepspeed( - model: Model, - optimizer: torch.optim.Optimizer, - deepspeed_config: DeepspeedConfig, - args: DeepspeedArgs, - batch_size: int, - gradient_accumulation_steps: int, -): - if not (optimizer is None or deepspeed_config.optimizer is None): - raise ConfigurationError( - f"Cannot provide both optimizer and deepspeed_optimizer. 
{optimizer, deepspeed_config.to_dict()}" + config: Dict[str, Any] = dict( + **{k: v for k, v in deepspeed_config.to_dict().items() if v is not None}, + train_batch_size=batch_size, + gradient_accumulation_steps=num_gradient_accumulation_steps, + ) + ds = DeepSpeedEngine( + args=args, + model=model, + optimizer=optimizer, + model_parameters=model.parameters(), + dist_init_required=False, + config_params=config, ) + if hasattr(ds, "timers"): + def mute_log(*args, **kwargs): + pass - config: Dict[str, Any] = dict( - **{k: v for k, v in deepspeed_config.to_dict().items() if v is not None}, - train_batch_size=batch_size, - gradient_accumulation_steps=gradient_accumulation_steps, - ) - ds = DeepSpeedEngine( - args=args, - model=model, - optimizer=optimizer, - model_parameters=model.parameters(), - dist_init_required=False, - config_params=config, - ) - if hasattr(ds, "timers"): - - def mute_log(*args, **kwargs): - pass - - ds.timers.log = mute_log - return ds, ds.optimizer + ds.timers.log = mute_log + return ds diff --git a/tests/commands/train_test.py b/tests/commands/train_test.py index 75f4515390a..7e38c07be01 100644 --- a/tests/commands/train_test.py +++ b/tests/commands/train_test.py @@ -235,6 +235,61 @@ def test_train_model_distributed(self): # Check we can load the serialized model assert load_archive(out_dir).model + def test_train_model_deepspeed(self): + if torch.cuda.device_count() >= 2: + devices = [0, 1] + else: + devices = [-1, -1] + + params = lambda: Params( + { + "model": { + "type": "simple_tagger", + "text_field_embedder": { + "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} + }, + "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + }, + "dataset_reader": {"type": "sequence_tagging"}, + "train_data_path": SEQUENCE_TAGGING_DATA_PATH, + "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, + "data_loader": {"batch_size": 2}, + "trainer": { + "type": "deepspeed", + "deepspeed_config": { + "zero_optimization": { "stage": 2 }, + "fp16": { "enabled": True, }, + }, + "num_epochs": 2, + "optimizer": "adam" + }, + "distributed": {"cuda_devices": devices}, + } + ) + + out_dir = os.path.join(self.TEST_DIR, "test_distributed_train") + train_model(params(), serialization_dir=out_dir) + + # Check that some logs specific to distributed + # training are where we expect. + serialized_files = os.listdir(out_dir) + assert "out_worker0.log" in serialized_files + assert "out_worker1.log" in serialized_files + assert "model.tar.gz" in serialized_files + assert "metrics.json" in serialized_files + + # Make sure the metrics look right. 
+ with open(os.path.join(out_dir, "metrics.json")) as f: + metrics = json.load(f) + assert metrics["peak_worker_0_memory_MB"] > 0 + assert metrics["peak_worker_1_memory_MB"] > 0 + if torch.cuda.device_count() >= 2: + assert metrics["peak_gpu_0_memory_MB"] > 0 + assert metrics["peak_gpu_1_memory_MB"] > 0 + + # Check we can load the serialized model + assert load_archive(out_dir).model + @cpu_or_gpu @pytest.mark.parametrize("lazy", [True, False]) def test_train_model_distributed_with_sharded_reader(self, lazy): diff --git a/tests/training/trainer_test.py b/tests/training/trainer_test.py index 1a2b00b83be..c138a13e787 100644 --- a/tests/training/trainer_test.py +++ b/tests/training/trainer_test.py @@ -1171,3 +1171,110 @@ def test_sparse_clip_grad(self): # Final norm should be 1.5 grad = embedding.weight.grad.coalesce() assert grad._values().norm(2.0).item() == pytest.approx(1.5, rel=1e-4) + + + +@requires_multi_gpu +class TestDeepspeedTrainer(TrainerTestBase): + @pytest.mark.parametrize( + "batch_size, num_gradient_accumulation_steps", [(32, 1)] + ) + def test_trainer_can_run_deepspeed(self, batch_size, num_gradient_accumulation_steps): + import torch.multiprocessing as mp + from allennlp.common import util as common_util + + node_rank = 0 + + device_ids = [int(d) for d in os.getenv('CUDA_VISIBLE_DEVICES').split(',')] + num_procs = len(device_ids) + + num_nodes = 1 + world_size = num_nodes * num_procs + + mp.spawn( + _test_worker, + args=( + self.TEST_DIR, + self.model, + self.optimizer, + self.data_loader, + batch_size, + num_gradient_accumulation_steps, + "127.0.0.1", + common_util.find_open_port(), + node_rank, + num_procs, + world_size, + ), + nprocs=num_procs, + ) + + +def _test_worker( + process_rank: int, + serialization_dir, + model, + optimizer, + data_loader, + batch_size, + num_gradient_accumulation_steps, + master_addr: str = "127.0.0.1", + master_port: int = 29500, + node_rank: int = 0, + num_procs_per_node: int = 1, + world_size: int = 1, +): + import torch.distributed as dist + from allennlp.commands.train import TrainModel + from allennlp.training.deepspeed.trainer import DeepspeedTrainer + from allennlp.training.deepspeed.config import ( + DeepspeedConfig, + DeepspeedArgs, + DeepspeedZeROConfig, + DeepspeedFP16Config + ) + + global_rank = node_rank * num_procs_per_node + process_rank + + # Number of processes per node is useful to know if a process + # is a master in the local node(node in which it is running) + os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node) + + backend = "nccl" if process_rank >= 0 else "gloo" + dist.init_process_group( + backend=backend, + init_method=f"tcp://{master_addr}:{master_port}", + world_size=world_size, + rank=global_rank, + ) + + engine = DeepspeedTrainer._build_engine( + model, + optimizer, + deepspeed_config=DeepspeedConfig( + zero_optimization=DeepspeedZeROConfig(), + fp16=DeepspeedFP16Config() + ), + args=DeepspeedArgs(local_rank=process_rank), + batch_size=batch_size, + num_gradient_accumulation_steps=num_gradient_accumulation_steps + ) + + trainer = DeepspeedTrainer( + model, + data_loader, + engine, + num_epochs=2, + num_gradient_accumulation_steps=num_gradient_accumulation_steps, + ) + train_loop = TrainModel( + serialization_dir=serialization_dir, + model=model, + trainer=trainer + ) + + _ = train_loop.run() + # metrics = train_loop.run() + # print(metrics) + import json + print(json.dumps(dict(os.environ), indent=2)) \ No newline at end of file From 95a9e5f41b5e17af1a31a5e0448c7bdede1f3966 Mon Sep 17 00:00:00 2001 From: 
Jacob Danovitch Date: Fri, 8 Jan 2021 14:35:21 -0500 Subject: [PATCH 16/20] tracking down dist barrier bug(s) --- allennlp/training/deepspeed/checkpointer.py | 2 + allennlp/training/deepspeed/trainer.py | 330 ++++++++++---------- tests/training/trainer_test.py | 109 +------ 3 files changed, 174 insertions(+), 267 deletions(-) diff --git a/allennlp/training/deepspeed/checkpointer.py b/allennlp/training/deepspeed/checkpointer.py index 56aaf988033..a8433b2e150 100644 --- a/allennlp/training/deepspeed/checkpointer.py +++ b/allennlp/training/deepspeed/checkpointer.py @@ -29,6 +29,7 @@ def save_checkpoint( if self._serialization_dir is None: return + # logger.info("Getting checkpoint state") with trainer.get_checkpoint_state() as state: model_state, training_states = state model_engine = trainer.model_engine @@ -42,6 +43,7 @@ def save_checkpoint( # not sure if ZeRO stage 2 will mess this up if not os.path.isfile(model_path): torch.save(model_state, model_path) + # logger.info("Saved model state") if save_model_only: return diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index 34503559181..553038f4654 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -95,7 +95,7 @@ def __init__( self.model_engine = deepspeed_engine self._distributed = True - serialization_dir = None + # serialization_dir = None if checkpointer is None and serialization_dir is not None: self._checkpointer = DeepspeedCheckpointer(serialization_dir) @@ -129,16 +129,17 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ + print(f'Rank {self._rank}: Starting epoch') logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) logger.info(f"logging mem usg") cpu_memory_usage = [] - for worker, memory in common_util.peak_cpu_memory().items(): - cpu_memory_usage.append((worker, memory)) - logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}") + # for worker, memory in common_util.peak_cpu_memory().items(): + # cpu_memory_usage.append((worker, memory)) + # logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}") gpu_memory_usage = [] - for gpu, memory in common_util.peak_gpu_memory().items(): - gpu_memory_usage.append((gpu, memory)) - logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") + # for gpu, memory in common_util.peak_gpu_memory().items(): + # gpu_memory_usage.append((gpu, memory)) + # logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") logger.info(f"done logging mem usg") regularization_penalty = self.model.get_regularization_penalty() @@ -177,6 +178,8 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total + if not self._master: + print(f'Rank {self._rank}: {batch_num_total}') batch_outputs = self.batch_outputs(batch, for_training=True) @@ -237,8 +240,8 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: param_updates, ) - if self._checkpointer is not None: - self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) + # if self._checkpointer is not None: + # self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) for callback in self._batch_callbacks: callback( @@ -252,6 +255,15 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: is_master=self._master, ) + if not self._master: + print(f'Rank {self._rank}: {batches_this_epoch}') + + if self._distributed: + 
dist.barrier() + + if not self._master: + print(f'Rank {self._rank}: Passed barrier') + metrics = training_util.get_metrics( self.model, train_loss, @@ -270,165 +282,165 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024) return metrics - def _try_train(self) -> Dict[str, Any]: - try: - epoch_counter = self._restore_checkpoint() - except RuntimeError: - traceback.print_exc() - raise ConfigurationError( - "Could not recover training from the checkpoint. Did you mean to output to " - "a different serialization directory or delete the existing serialization " - "directory?" - ) - - training_util.enable_gradient_clipping(self.model, self._grad_clipping) - - logger.info("Beginning training.") - - val_metrics: Dict[str, float] = {} - this_epoch_val_metric: float = 0.0 - metrics: Dict[str, Any] = {} - epochs_trained = 0 - training_start_time = time.time() - - metrics["best_epoch"] = self._metric_tracker.best_epoch - for key, value in self._metric_tracker.best_epoch_metrics.items(): - metrics["best_validation_" + key] = value - - for callback in self._epoch_callbacks: - callback(self, metrics={}, epoch=-1, is_master=self._master) - - for epoch in range(epoch_counter, self._num_epochs): - epoch_start_time = time.time() - logger.info("Training epoch.") - train_metrics = self._train_epoch(epoch) - - # if self._master and self._checkpointer is not None: - # self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) - - # # Wait for the master to finish saving the model checkpoint - # if self._distributed: - # dist.barrier() + # def _try_train(self) -> Dict[str, Any]: + # try: + # epoch_counter = self._restore_checkpoint() + # except RuntimeError: + # traceback.print_exc() + # raise ConfigurationError( + # "Could not recover training from the checkpoint. Did you mean to output to " + # "a different serialization directory or delete the existing serialization " + # "directory?" + # ) + + # training_util.enable_gradient_clipping(self.model, self._grad_clipping) + + # logger.info("Beginning training.") + + # val_metrics: Dict[str, float] = {} + # this_epoch_val_metric: float = 0.0 + # metrics: Dict[str, Any] = {} + # epochs_trained = 0 + # training_start_time = time.time() + + # metrics["best_epoch"] = self._metric_tracker.best_epoch + # for key, value in self._metric_tracker.best_epoch_metrics.items(): + # metrics["best_validation_" + key] = value + + # for callback in self._epoch_callbacks: + # callback(self, metrics={}, epoch=-1, is_master=self._master) + + # for epoch in range(epoch_counter, self._num_epochs): + # epoch_start_time = time.time() + # logger.info("Training epoch.") + # train_metrics = self._train_epoch(epoch) + + # if self._master and self._checkpointer is not None: + # self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) + + # # # Wait for the master to finish saving the model checkpoint + # if self._distributed: + # dist.barrier() - logger.info("Passed start of epoch checkpoint barrier.") - - # get peak of memory usage - for key, value in train_metrics.items(): - if key.startswith("gpu_") and key.endswith("_memory_MB"): - metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) - elif key.startswith("worker_") and key.endswith("_memory_MB"): - metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) - - if self._validation_data_loader is not None: - with torch.no_grad(): - # We have a validation set, so compute all the metrics on it. 
- val_loss, val_reg_loss, num_batches = self._validation_loss(epoch) - - # It is safe again to wait till the validation is done. This is - # important to get the metrics right. - if self._distributed: - dist.barrier() - - logger.info("Passed validation metrics barrier.") - - val_metrics = training_util.get_metrics( - self.model, - val_loss, - val_reg_loss, - batch_loss=None, - batch_reg_loss=None, - num_batches=num_batches, - reset=True, - world_size=self._world_size, - cuda_device=self.cuda_device, - ) - - # Check validation metric for early stopping - this_epoch_val_metric = val_metrics[self._validation_metric] - self._metric_tracker.add_metric(this_epoch_val_metric) - - if self._metric_tracker.should_stop_early(): - logger.info("Ran out of patience. Stopping training.") - break - - if self._master: - self._tensorboard.log_metrics( - train_metrics, val_metrics=val_metrics, log_to_console=True, epoch=epoch + 1 - ) # +1 because tensorboard doesn't like 0 - - # Create overall metrics dict - training_elapsed_time = time.time() - training_start_time - metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time)) - metrics["training_start_epoch"] = epoch_counter - metrics["training_epochs"] = epochs_trained - metrics["epoch"] = epoch - - for key, value in train_metrics.items(): - metrics["training_" + key] = value - for key, value in val_metrics.items(): - metrics["validation_" + key] = value - - if self._metric_tracker.is_best_so_far(): - # Update all the best_ metrics. - # (Otherwise they just stay the same as they were.) - metrics["best_epoch"] = epoch - for key, value in val_metrics.items(): - metrics["best_validation_" + key] = value - - self._metric_tracker.best_epoch_metrics = val_metrics - - if self._serialization_dir and self._master: - common_util.dump_metrics( - os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), - metrics, - ) - - # The Scheduler API is agnostic to whether your schedule requires a validation metric - - # if it doesn't, the validation metric passed here is ignored. - if self._learning_rate_scheduler: - self._learning_rate_scheduler.step(this_epoch_val_metric) - if self._momentum_scheduler: - self._momentum_scheduler.step(this_epoch_val_metric) - - if self._master and self._checkpointer is not None: - self._checkpointer.save_checkpoint( - epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() - ) - - logger.info("Starting end of epoch checkpoint barrier...") - # Wait for the master to finish saving the checkpoint - # if self._distributed: - # dist.barrier() + # # logger.info("Passed start of epoch checkpoint barrier.") + + # # get peak of memory usage + # for key, value in train_metrics.items(): + # if key.startswith("gpu_") and key.endswith("_memory_MB"): + # metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) + # elif key.startswith("worker_") and key.endswith("_memory_MB"): + # metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) + + # if self._validation_data_loader is not None: + # with torch.no_grad(): + # # We have a validation set, so compute all the metrics on it. + # val_loss, val_reg_loss, num_batches = self._validation_loss(epoch) + + # # It is safe again to wait till the validation is done. This is + # # important to get the metrics right. 
+ # if self._distributed: + # dist.barrier() + + # # logger.info("Passed validation metrics barrier.") + + # val_metrics = training_util.get_metrics( + # self.model, + # val_loss, + # val_reg_loss, + # batch_loss=None, + # batch_reg_loss=None, + # num_batches=num_batches, + # reset=True, + # world_size=self._world_size, + # cuda_device=self.cuda_device, + # ) + + # # Check validation metric for early stopping + # this_epoch_val_metric = val_metrics[self._validation_metric] + # self._metric_tracker.add_metric(this_epoch_val_metric) + + # if self._metric_tracker.should_stop_early(): + # logger.info("Ran out of patience. Stopping training.") + # break + + # if self._master: + # self._tensorboard.log_metrics( + # train_metrics, val_metrics=val_metrics, log_to_console=True, epoch=epoch + 1 + # ) # +1 because tensorboard doesn't like 0 + + # # Create overall metrics dict + # training_elapsed_time = time.time() - training_start_time + # metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time)) + # metrics["training_start_epoch"] = epoch_counter + # metrics["training_epochs"] = epochs_trained + # metrics["epoch"] = epoch + + # for key, value in train_metrics.items(): + # metrics["training_" + key] = value + # for key, value in val_metrics.items(): + # metrics["validation_" + key] = value + + # if self._metric_tracker.is_best_so_far(): + # # Update all the best_ metrics. + # # (Otherwise they just stay the same as they were.) + # metrics["best_epoch"] = epoch + # for key, value in val_metrics.items(): + # metrics["best_validation_" + key] = value + + # self._metric_tracker.best_epoch_metrics = val_metrics + + # if self._serialization_dir and self._master: + # common_util.dump_metrics( + # os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), + # metrics, + # ) + + # # The Scheduler API is agnostic to whether your schedule requires a validation metric - + # # if it doesn't, the validation metric passed here is ignored. 
+ # if self._learning_rate_scheduler: + # self._learning_rate_scheduler.step(this_epoch_val_metric) + # if self._momentum_scheduler: + # self._momentum_scheduler.step(this_epoch_val_metric) + + # if self._master and self._checkpointer is not None: + # self._checkpointer.save_checkpoint( + # epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() + # ) + + # # logger.info("Starting end of epoch checkpoint barrier...") + # # Wait for the master to finish saving the checkpoint + # if self._distributed: + # dist.barrier() - logger.info("Passed end of epoch checkpoint barrier.") + # # logger.info("Passed end of epoch checkpoint barrier.") - for callback in self._epoch_callbacks: - callback(self, metrics=metrics, epoch=epoch, is_master=self._master) + # for callback in self._epoch_callbacks: + # callback(self, metrics=metrics, epoch=epoch, is_master=self._master) - epoch_elapsed_time = time.time() - epoch_start_time - logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time)) + # epoch_elapsed_time = time.time() - epoch_start_time + # logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time)) - if epoch < self._num_epochs - 1: - training_elapsed_time = time.time() - training_start_time - estimated_time_remaining = training_elapsed_time * ( - (self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1 - ) - formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining))) - logger.info("Estimated training time remaining: %s", formatted_time) + # if epoch < self._num_epochs - 1: + # training_elapsed_time = time.time() - training_start_time + # estimated_time_remaining = training_elapsed_time * ( + # (self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1 + # ) + # formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining))) + # logger.info("Estimated training time remaining: %s", formatted_time) - epochs_trained += 1 + # epochs_trained += 1 - for callback in self._end_callbacks: - callback(self, metrics=metrics, epoch=epoch, is_master=self._master) + # for callback in self._end_callbacks: + # callback(self, metrics=metrics, epoch=epoch, is_master=self._master) - # Load the best model state before returning - best_model_state = ( - None if self._checkpointer is None else self._checkpointer.best_model_state() - ) - if best_model_state: - self.model.load_state_dict(best_model_state) + # # Load the best model state before returning + # best_model_state = ( + # None if self._checkpointer is None else self._checkpointer.best_model_state() + # ) + # if best_model_state: + # self.model.load_state_dict(best_model_state) - return metrics + # return metrics def _restore_checkpoint(self) -> int: """ diff --git a/tests/training/trainer_test.py b/tests/training/trainer_test.py index c138a13e787..ec078fa4139 100644 --- a/tests/training/trainer_test.py +++ b/tests/training/trainer_test.py @@ -1170,111 +1170,4 @@ def test_sparse_clip_grad(self): _ = clip_grad_norm_([embedding.weight], 1.5) # Final norm should be 1.5 grad = embedding.weight.grad.coalesce() - assert grad._values().norm(2.0).item() == pytest.approx(1.5, rel=1e-4) - - - -@requires_multi_gpu -class TestDeepspeedTrainer(TrainerTestBase): - @pytest.mark.parametrize( - "batch_size, num_gradient_accumulation_steps", [(32, 1)] - ) - def test_trainer_can_run_deepspeed(self, batch_size, num_gradient_accumulation_steps): - import torch.multiprocessing as mp - from allennlp.common import util as common_util - - node_rank = 0 - - device_ids = 
[int(d) for d in os.getenv('CUDA_VISIBLE_DEVICES').split(',')] - num_procs = len(device_ids) - - num_nodes = 1 - world_size = num_nodes * num_procs - - mp.spawn( - _test_worker, - args=( - self.TEST_DIR, - self.model, - self.optimizer, - self.data_loader, - batch_size, - num_gradient_accumulation_steps, - "127.0.0.1", - common_util.find_open_port(), - node_rank, - num_procs, - world_size, - ), - nprocs=num_procs, - ) - - -def _test_worker( - process_rank: int, - serialization_dir, - model, - optimizer, - data_loader, - batch_size, - num_gradient_accumulation_steps, - master_addr: str = "127.0.0.1", - master_port: int = 29500, - node_rank: int = 0, - num_procs_per_node: int = 1, - world_size: int = 1, -): - import torch.distributed as dist - from allennlp.commands.train import TrainModel - from allennlp.training.deepspeed.trainer import DeepspeedTrainer - from allennlp.training.deepspeed.config import ( - DeepspeedConfig, - DeepspeedArgs, - DeepspeedZeROConfig, - DeepspeedFP16Config - ) - - global_rank = node_rank * num_procs_per_node + process_rank - - # Number of processes per node is useful to know if a process - # is a master in the local node(node in which it is running) - os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node) - - backend = "nccl" if process_rank >= 0 else "gloo" - dist.init_process_group( - backend=backend, - init_method=f"tcp://{master_addr}:{master_port}", - world_size=world_size, - rank=global_rank, - ) - - engine = DeepspeedTrainer._build_engine( - model, - optimizer, - deepspeed_config=DeepspeedConfig( - zero_optimization=DeepspeedZeROConfig(), - fp16=DeepspeedFP16Config() - ), - args=DeepspeedArgs(local_rank=process_rank), - batch_size=batch_size, - num_gradient_accumulation_steps=num_gradient_accumulation_steps - ) - - trainer = DeepspeedTrainer( - model, - data_loader, - engine, - num_epochs=2, - num_gradient_accumulation_steps=num_gradient_accumulation_steps, - ) - train_loop = TrainModel( - serialization_dir=serialization_dir, - model=model, - trainer=trainer - ) - - _ = train_loop.run() - # metrics = train_loop.run() - # print(metrics) - import json - print(json.dumps(dict(os.environ), indent=2)) \ No newline at end of file + assert grad._values().norm(2.0).item() == pytest.approx(1.5, rel=1e-4) \ No newline at end of file From b152fe1a4853d598f15f5ff8f0d5c742bf342a78 Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Tue, 19 Jan 2021 11:31:36 -0500 Subject: [PATCH 17/20] catch up --- allennlp/training/deepspeed/checkpointer.py | 18 +- allennlp/training/deepspeed/trainer.py | 314 ++++++++++---------- 2 files changed, 175 insertions(+), 157 deletions(-) diff --git a/allennlp/training/deepspeed/checkpointer.py b/allennlp/training/deepspeed/checkpointer.py index a8433b2e150..4a4a55e6853 100644 --- a/allennlp/training/deepspeed/checkpointer.py +++ b/allennlp/training/deepspeed/checkpointer.py @@ -18,6 +18,21 @@ class DeepspeedCheckpointer(Checkpointer): + def manual_save( + self, + model_engine, + serialization_dir: str, + tag: str, + client_state={} + ): + # if self.save_non_zero_checkpoint: + model_engine._create_checkpoint_file(serialization_dir, tag, False) + model_engine._save_checkpoint(serialization_dir, tag, client_state=client_state) + + # if self.save_zero_checkpoint: + model_engine._create_zero_checkpoint_files(serialization_dir, tag) + model_engine._save_zero_checkpoint(serialization_dir, tag) + @overrides def save_checkpoint( self, @@ -36,7 +51,8 @@ def save_checkpoint( checkpoint_id = "deepspeed_epoch_{}".format(epoch) model_path = 
os.path.join(self._serialization_dir, "model_state_epoch_{}".format(epoch)) - model_engine.save_checkpoint(self._serialization_dir, checkpoint_id) + # model_engine.save_checkpoint(self._serialization_dir, checkpoint_id) + self.manual_save(model_engine, self._serialization_dir, checkpoint_id) # TODO # Model will need a weight file to load; diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index 553038f4654..7d38435b16f 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -133,13 +133,13 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) logger.info(f"logging mem usg") cpu_memory_usage = [] - # for worker, memory in common_util.peak_cpu_memory().items(): - # cpu_memory_usage.append((worker, memory)) - # logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}") + for worker, memory in common_util.peak_cpu_memory().items(): + cpu_memory_usage.append((worker, memory)) + logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}") gpu_memory_usage = [] - # for gpu, memory in common_util.peak_gpu_memory().items(): - # gpu_memory_usage.append((gpu, memory)) - # logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") + for gpu, memory in common_util.peak_gpu_memory().items(): + gpu_memory_usage.append((gpu, memory)) + logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") logger.info(f"done logging mem usg") regularization_penalty = self.model.get_regularization_penalty() @@ -282,165 +282,167 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024) return metrics - # def _try_train(self) -> Dict[str, Any]: - # try: - # epoch_counter = self._restore_checkpoint() - # except RuntimeError: - # traceback.print_exc() - # raise ConfigurationError( - # "Could not recover training from the checkpoint. Did you mean to output to " - # "a different serialization directory or delete the existing serialization " - # "directory?" - # ) - - # training_util.enable_gradient_clipping(self.model, self._grad_clipping) - - # logger.info("Beginning training.") - - # val_metrics: Dict[str, float] = {} - # this_epoch_val_metric: float = 0.0 - # metrics: Dict[str, Any] = {} - # epochs_trained = 0 - # training_start_time = time.time() - - # metrics["best_epoch"] = self._metric_tracker.best_epoch - # for key, value in self._metric_tracker.best_epoch_metrics.items(): - # metrics["best_validation_" + key] = value - - # for callback in self._epoch_callbacks: - # callback(self, metrics={}, epoch=-1, is_master=self._master) - - # for epoch in range(epoch_counter, self._num_epochs): - # epoch_start_time = time.time() - # logger.info("Training epoch.") - # train_metrics = self._train_epoch(epoch) - - # if self._master and self._checkpointer is not None: - # self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) - - # # # Wait for the master to finish saving the model checkpoint - # if self._distributed: - # dist.barrier() + def _try_train(self) -> Dict[str, Any]: + try: + epoch_counter = self._restore_checkpoint() + except RuntimeError: + traceback.print_exc() + raise ConfigurationError( + "Could not recover training from the checkpoint. Did you mean to output to " + "a different serialization directory or delete the existing serialization " + "directory?" 
+ ) + + training_util.enable_gradient_clipping(self.model, self._grad_clipping) + + logger.info("Beginning training.") + + val_metrics: Dict[str, float] = {} + this_epoch_val_metric: float = 0.0 + metrics: Dict[str, Any] = {} + epochs_trained = 0 + training_start_time = time.time() + + metrics["best_epoch"] = self._metric_tracker.best_epoch + for key, value in self._metric_tracker.best_epoch_metrics.items(): + metrics["best_validation_" + key] = value + + for callback in self._epoch_callbacks: + callback(self, metrics={}, epoch=-1, is_master=self._master) + + for epoch in range(epoch_counter, self._num_epochs): + epoch_start_time = time.time() + logger.info("Training epoch.") + train_metrics = self._train_epoch(epoch) + + # if self._master and self._checkpointer is not None: + if self._checkpointer is not None: + self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) + + # # Wait for the master to finish saving the model checkpoint + if self._distributed: + dist.barrier() - # # logger.info("Passed start of epoch checkpoint barrier.") - - # # get peak of memory usage - # for key, value in train_metrics.items(): - # if key.startswith("gpu_") and key.endswith("_memory_MB"): - # metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) - # elif key.startswith("worker_") and key.endswith("_memory_MB"): - # metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) - - # if self._validation_data_loader is not None: - # with torch.no_grad(): - # # We have a validation set, so compute all the metrics on it. - # val_loss, val_reg_loss, num_batches = self._validation_loss(epoch) - - # # It is safe again to wait till the validation is done. This is - # # important to get the metrics right. - # if self._distributed: - # dist.barrier() - - # # logger.info("Passed validation metrics barrier.") - - # val_metrics = training_util.get_metrics( - # self.model, - # val_loss, - # val_reg_loss, - # batch_loss=None, - # batch_reg_loss=None, - # num_batches=num_batches, - # reset=True, - # world_size=self._world_size, - # cuda_device=self.cuda_device, - # ) - - # # Check validation metric for early stopping - # this_epoch_val_metric = val_metrics[self._validation_metric] - # self._metric_tracker.add_metric(this_epoch_val_metric) - - # if self._metric_tracker.should_stop_early(): - # logger.info("Ran out of patience. Stopping training.") - # break - - # if self._master: - # self._tensorboard.log_metrics( - # train_metrics, val_metrics=val_metrics, log_to_console=True, epoch=epoch + 1 - # ) # +1 because tensorboard doesn't like 0 - - # # Create overall metrics dict - # training_elapsed_time = time.time() - training_start_time - # metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time)) - # metrics["training_start_epoch"] = epoch_counter - # metrics["training_epochs"] = epochs_trained - # metrics["epoch"] = epoch - - # for key, value in train_metrics.items(): - # metrics["training_" + key] = value - # for key, value in val_metrics.items(): - # metrics["validation_" + key] = value - - # if self._metric_tracker.is_best_so_far(): - # # Update all the best_ metrics. - # # (Otherwise they just stay the same as they were.) 
- # metrics["best_epoch"] = epoch - # for key, value in val_metrics.items(): - # metrics["best_validation_" + key] = value - - # self._metric_tracker.best_epoch_metrics = val_metrics - - # if self._serialization_dir and self._master: - # common_util.dump_metrics( - # os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), - # metrics, - # ) - - # # The Scheduler API is agnostic to whether your schedule requires a validation metric - - # # if it doesn't, the validation metric passed here is ignored. - # if self._learning_rate_scheduler: - # self._learning_rate_scheduler.step(this_epoch_val_metric) - # if self._momentum_scheduler: - # self._momentum_scheduler.step(this_epoch_val_metric) - - # if self._master and self._checkpointer is not None: - # self._checkpointer.save_checkpoint( - # epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() - # ) - - # # logger.info("Starting end of epoch checkpoint barrier...") - # # Wait for the master to finish saving the checkpoint - # if self._distributed: - # dist.barrier() + # logger.info("Passed start of epoch checkpoint barrier.") + + # get peak of memory usage + for key, value in train_metrics.items(): + if key.startswith("gpu_") and key.endswith("_memory_MB"): + metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) + elif key.startswith("worker_") and key.endswith("_memory_MB"): + metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) + + if self._validation_data_loader is not None: + with torch.no_grad(): + # We have a validation set, so compute all the metrics on it. + val_loss, val_reg_loss, num_batches = self._validation_loss(epoch) + + # It is safe again to wait till the validation is done. This is + # important to get the metrics right. + if self._distributed: + dist.barrier() + + # logger.info("Passed validation metrics barrier.") + + val_metrics = training_util.get_metrics( + self.model, + val_loss, + val_reg_loss, + batch_loss=None, + batch_reg_loss=None, + num_batches=num_batches, + reset=True, + world_size=self._world_size, + cuda_device=self.cuda_device, + ) + + # Check validation metric for early stopping + this_epoch_val_metric = val_metrics[self._validation_metric] + self._metric_tracker.add_metric(this_epoch_val_metric) + + if self._metric_tracker.should_stop_early(): + logger.info("Ran out of patience. Stopping training.") + break + + if self._master: + self._tensorboard.log_metrics( + train_metrics, val_metrics=val_metrics, log_to_console=True, epoch=epoch + 1 + ) # +1 because tensorboard doesn't like 0 + + # Create overall metrics dict + training_elapsed_time = time.time() - training_start_time + metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time)) + metrics["training_start_epoch"] = epoch_counter + metrics["training_epochs"] = epochs_trained + metrics["epoch"] = epoch + + for key, value in train_metrics.items(): + metrics["training_" + key] = value + for key, value in val_metrics.items(): + metrics["validation_" + key] = value + + if self._metric_tracker.is_best_so_far(): + # Update all the best_ metrics. + # (Otherwise they just stay the same as they were.) 
+ metrics["best_epoch"] = epoch + for key, value in val_metrics.items(): + metrics["best_validation_" + key] = value + + self._metric_tracker.best_epoch_metrics = val_metrics + + if self._serialization_dir and self._master: + common_util.dump_metrics( + os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), + metrics, + ) + + # The Scheduler API is agnostic to whether your schedule requires a validation metric - + # if it doesn't, the validation metric passed here is ignored. + if self._learning_rate_scheduler: + self._learning_rate_scheduler.step(this_epoch_val_metric) + if self._momentum_scheduler: + self._momentum_scheduler.step(this_epoch_val_metric) + + # if self._master and self._checkpointer is not None: + if self._checkpointer is not None: + self._checkpointer.save_checkpoint( + epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() + ) + + # logger.info("Starting end of epoch checkpoint barrier...") + # Wait for the master to finish saving the checkpoint + if self._distributed: + dist.barrier() - # # logger.info("Passed end of epoch checkpoint barrier.") + # logger.info("Passed end of epoch checkpoint barrier.") - # for callback in self._epoch_callbacks: - # callback(self, metrics=metrics, epoch=epoch, is_master=self._master) + for callback in self._epoch_callbacks: + callback(self, metrics=metrics, epoch=epoch, is_master=self._master) - # epoch_elapsed_time = time.time() - epoch_start_time - # logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time)) + epoch_elapsed_time = time.time() - epoch_start_time + logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time)) - # if epoch < self._num_epochs - 1: - # training_elapsed_time = time.time() - training_start_time - # estimated_time_remaining = training_elapsed_time * ( - # (self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1 - # ) - # formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining))) - # logger.info("Estimated training time remaining: %s", formatted_time) + if epoch < self._num_epochs - 1: + training_elapsed_time = time.time() - training_start_time + estimated_time_remaining = training_elapsed_time * ( + (self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1 + ) + formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining))) + logger.info("Estimated training time remaining: %s", formatted_time) - # epochs_trained += 1 + epochs_trained += 1 - # for callback in self._end_callbacks: - # callback(self, metrics=metrics, epoch=epoch, is_master=self._master) + for callback in self._end_callbacks: + callback(self, metrics=metrics, epoch=epoch, is_master=self._master) - # # Load the best model state before returning - # best_model_state = ( - # None if self._checkpointer is None else self._checkpointer.best_model_state() - # ) - # if best_model_state: - # self.model.load_state_dict(best_model_state) + # Load the best model state before returning + best_model_state = ( + None if self._checkpointer is None else self._checkpointer.best_model_state() + ) + if best_model_state: + self.model.load_state_dict(best_model_state) - # return metrics + return metrics def _restore_checkpoint(self) -> int: """ From 4fb66044bbaea33153435d822331674e173a6d88 Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Wed, 20 Jan 2021 11:24:08 -0500 Subject: [PATCH 18/20] moved master checks to checkpointer to accomodate deepspeed --- allennlp/training/checkpointer.py | 4 + 
allennlp/training/deepspeed/checkpointer.py | 106 +++----------------- allennlp/training/deepspeed/trainer.py | 19 ++-- allennlp/training/trainer.py | 4 +- tests/commands/train_test.py | 3 +- 5 files changed, 28 insertions(+), 108 deletions(-) diff --git a/allennlp/training/checkpointer.py b/allennlp/training/checkpointer.py index 1cf9e8b8cf2..2833f1a6721 100644 --- a/allennlp/training/checkpointer.py +++ b/allennlp/training/checkpointer.py @@ -76,6 +76,8 @@ def maybe_save_checkpoint( only looks at time, not batch or epoch number, though those parameters are available to you if you want to customize the behavior of this function. """ + if not trainer._master: + return if self._model_save_interval is None: return if time.time() - self._last_save_time < self._model_save_interval: @@ -92,6 +94,8 @@ def save_checkpoint( is_best_so_far: bool = False, save_model_only=False, ) -> None: + if not trainer._master: + return if self._serialization_dir is not None: with trainer.get_checkpoint_state() as state: model_state, training_states = state diff --git a/allennlp/training/deepspeed/checkpointer.py b/allennlp/training/deepspeed/checkpointer.py index 4a4a55e6853..b21c675362d 100644 --- a/allennlp/training/deepspeed/checkpointer.py +++ b/allennlp/training/deepspeed/checkpointer.py @@ -18,21 +18,6 @@ class DeepspeedCheckpointer(Checkpointer): - def manual_save( - self, - model_engine, - serialization_dir: str, - tag: str, - client_state={} - ): - # if self.save_non_zero_checkpoint: - model_engine._create_checkpoint_file(serialization_dir, tag, False) - model_engine._save_checkpoint(serialization_dir, tag, client_state=client_state) - - # if self.save_zero_checkpoint: - model_engine._create_zero_checkpoint_files(serialization_dir, tag) - model_engine._save_zero_checkpoint(serialization_dir, tag) - @overrides def save_checkpoint( self, @@ -43,78 +28,19 @@ def save_checkpoint( ) -> None: if self._serialization_dir is None: return + + super().save_checkpoint(epoch, trainer, is_best_so_far, save_model_only) - # logger.info("Getting checkpoint state") - with trainer.get_checkpoint_state() as state: - model_state, training_states = state - model_engine = trainer.model_engine - - checkpoint_id = "deepspeed_epoch_{}".format(epoch) - model_path = os.path.join(self._serialization_dir, "model_state_epoch_{}".format(epoch)) - # model_engine.save_checkpoint(self._serialization_dir, checkpoint_id) - self.manual_save(model_engine, self._serialization_dir, checkpoint_id) - - # TODO - # Model will need a weight file to load; - # not sure if ZeRO stage 2 will mess this up - if not os.path.isfile(model_path): - torch.save(model_state, model_path) - # logger.info("Saved model state") - if save_model_only: - return - - training_path = os.path.join( - self._serialization_dir, "training_state_epoch_{}.th".format(epoch) - ) - if not os.path.isfile(training_path): - torch.save({**training_states, "epoch": epoch}, training_path) - - # The main checkpointing logic is now done, this is just shuffling files around, to keep - # track of best weights, and to remove old checkpoints, if desired. - if is_best_so_far: - logger.info( - "Best validation performance so far. 
Copying weights to '%s/best.th'.", - self._serialization_dir, - ) - shutil.copyfile(model_path, os.path.join(self._serialization_dir, "best.th")) + checkpoint_id = "deepspeed_epoch_{}".format(epoch) + model_path = os.path.join(self._serialization_dir, "model_state_epoch_{}".format(epoch)) + trainer.model_engine.save_checkpoint(self._serialization_dir, checkpoint_id) + if trainer._master and is_best_so_far: engine_dir = os.path.join(self._serialization_dir, "best_deepspeed") shutil.rmtree(engine_dir, ignore_errors=True) # in case no previous checkpoints shutil.copytree(os.path.join(self._serialization_dir, checkpoint_id), engine_dir) - if ( - self._num_serialized_models_to_keep is not None - and self._num_serialized_models_to_keep >= 0 - ): - self._serialized_paths.append((time.time(), model_path, training_path)) - if len(self._serialized_paths) > self._num_serialized_models_to_keep: - paths_to_remove = self._serialized_paths.pop(0) - # Check to see if we should keep this checkpoint, if it has been longer - # then self._keep_serialized_model_every_num_seconds since the last - # kept checkpoint. - remove_path = True - if self._keep_serialized_model_every_num_seconds is not None: - save_time = paths_to_remove[0] - time_since_checkpoint_kept = ( - save_time - self._last_permanent_saved_checkpoint_time - ) - if time_since_checkpoint_kept > self._keep_serialized_model_every_num_seconds: - # We want to keep this checkpoint. - remove_path = False - self._last_permanent_saved_checkpoint_time = save_time - if remove_path: - for fname in paths_to_remove[1:]: - if os.path.isfile(fname): - os.remove(fname) - - @overrides - def find_latest_checkpoint(self) -> Optional[Tuple[str, str, str]]: - latest = super().find_latest_checkpoint() - if not latest: - return None - - model_path, training_state_path = latest - + def find_latest_deepspeed_checkpoint(self) -> Optional[str]: checkpoints: Iterable[Path] = ( self._serialization_dir and Path(self._serialization_dir).glob("deepspeed_epoch_*") ) or [] @@ -122,19 +48,11 @@ def find_latest_checkpoint(self) -> Optional[Tuple[str, str, str]]: if not checkpoints: return None - engine_path = checkpoints[-1] - return str(engine_path), model_path, training_state_path + engine_path = str(checkpoints[-1]) + return engine_path @overrides - def restore_checkpoint(self) -> Tuple[int, Dict[str, Any], Dict[str, Any]]: - latest_checkpoint = self.find_latest_checkpoint() - - if latest_checkpoint is None: - # No checkpoint to restore, start at 0 - return -1, {}, {} - - checkpoint_id, model_path, training_state_path = latest_checkpoint - - model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1)) - training_state = torch.load(training_state_path, map_location=nn_util.device_mapping(-1)) + def restore_checkpoint(self) -> Tuple[str, Dict[str, Any], Dict[str, Any]]: + model_state, training_state = super().restore_checkpoint() + checkpoint_id = self.find_latest_deepspeed_checkpoint() return checkpoint_id, model_state, training_state diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index 7d38435b16f..f8920ff0a7a 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -85,7 +85,7 @@ def __init__( epoch_callbacks=epoch_callbacks, end_callbacks=end_callbacks, trainer_callbacks=trainer_callbacks, - distributed=False, + distributed=False, # Avoid DDP init local_rank=local_rank, world_size=world_size, num_gradient_accumulation_steps=num_gradient_accumulation_steps, @@ -129,9 +129,7 @@ 
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ - print(f'Rank {self._rank}: Starting epoch') logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) - logger.info(f"logging mem usg") cpu_memory_usage = [] for worker, memory in common_util.peak_cpu_memory().items(): cpu_memory_usage.append((worker, memory)) @@ -140,7 +138,6 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: for gpu, memory in common_util.peak_gpu_memory().items(): gpu_memory_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") - logger.info(f"done logging mem usg") regularization_penalty = self.model.get_regularization_penalty() @@ -178,8 +175,8 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total - if not self._master: - print(f'Rank {self._rank}: {batch_num_total}') + # if not self._master: + # print(f'Rank {self._rank}: {batch_num_total}') batch_outputs = self.batch_outputs(batch, for_training=True) @@ -255,14 +252,14 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: is_master=self._master, ) - if not self._master: - print(f'Rank {self._rank}: {batches_this_epoch}') + # if not self._master: + # print(f'Rank {self._rank}: {batches_this_epoch}') if self._distributed: dist.barrier() - if not self._master: - print(f'Rank {self._rank}: Passed barrier') + # if not self._master: + # print(f'Rank {self._rank}: Passed barrier') metrics = training_util.get_metrics( self.model, @@ -282,7 +279,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024) return metrics - def _try_train(self) -> Dict[str, Any]: + def __try_train(self) -> Dict[str, Any]: try: epoch_counter = self._restore_checkpoint() except RuntimeError: diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 1b3a889ac62..91e0e5d50c7 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -1000,7 +1000,7 @@ def _try_train(self) -> Dict[str, Any]: epoch_start_time = time.time() train_metrics = self._train_epoch(epoch) - if self._master and self._checkpointer is not None: + if self._checkpointer is not None: self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) # Wait for the master to finish saving the model checkpoint @@ -1083,7 +1083,7 @@ def _try_train(self) -> Dict[str, Any]: if self._momentum_scheduler: self._momentum_scheduler.step(this_epoch_val_metric) - if self._master and self._checkpointer is not None: + if self._checkpointer is not None: self._checkpointer.save_checkpoint( epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() ) diff --git a/tests/commands/train_test.py b/tests/commands/train_test.py index 7e38c07be01..0c093af6515 100644 --- a/tests/commands/train_test.py +++ b/tests/commands/train_test.py @@ -16,7 +16,7 @@ from allennlp.commands.train import Train, train_model, train_model_from_args, TrainModel from allennlp.common import Params from allennlp.common.checks import ConfigurationError -from allennlp.common.testing import AllenNlpTestCase, cpu_or_gpu +from allennlp.common.testing import AllenNlpTestCase, cpu_or_gpu, requires_multi_gpu from allennlp.data import DatasetReader, Instance, Vocabulary from allennlp.data.dataloader import TensorDict from allennlp.models import load_archive, Model @@ -235,6 +235,7 @@ def test_train_model_distributed(self): # Check we can load the serialized 
model assert load_archive(out_dir).model + @requires_multi_gpu def test_train_model_deepspeed(self): if torch.cuda.device_count() >= 2: devices = [0, 1] From 703843c6f737242450436ee7e6fada2cb968186c Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Wed, 10 Feb 2021 17:07:59 -0500 Subject: [PATCH 19/20] updating to 2.0 --- CHANGELOG.md | 2 + allennlp/commands/__init__.py | 4 +- allennlp/commands/evaluate.py | 20 +- allennlp/commands/find_learning_rate.py | 14 +- allennlp/commands/predict.py | 17 +- allennlp/commands/subcommand.py | 5 +- allennlp/commands/train.py | 15 +- allennlp/common/cached_transformers.py | 4 +- allennlp/common/file_utils.py | 14 +- allennlp/common/from_params.py | 24 +- allennlp/common/plugins.py | 11 +- allennlp/common/testing/model_test_case.py | 22 +- allennlp/common/testing/test_case.py | 3 +- allennlp/common/util.py | 5 +- allennlp/data/data_loaders/__init__.py | 11 +- .../data_loaders/multiprocess_data_loader.py | 6 +- .../data_loaders/multitask_data_loader.py | 5 +- .../data/data_loaders/multitask_scheduler.py | 4 +- .../data/data_loaders/simple_data_loader.py | 6 +- allennlp/data/dataset_readers/__init__.py | 8 +- allennlp/data/dataset_readers/conll2003.py | 4 +- .../data/dataset_readers/dataset_reader.py | 5 +- .../dataset_readers/dataset_utils/__init__.py | 5 +- .../data/dataset_readers/sequence_tagging.py | 4 +- .../dataset_readers/sharded_dataset_reader.py | 4 +- .../text_classification_json.py | 4 +- allennlp/data/fields/adjacency_field.py | 4 +- allennlp/data/fields/field.py | 5 +- allennlp/data/fields/label_field.py | 5 +- allennlp/data/fields/list_field.py | 4 +- allennlp/data/fields/text_field.py | 4 +- allennlp/data/image_loader.py | 4 +- allennlp/data/token_indexers/__init__.py | 4 +- allennlp/data/token_indexers/elmo_indexer.py | 4 +- allennlp/data/tokenizers/__init__.py | 4 +- allennlp/data/vocabulary.py | 12 +- allennlp/interpret/__init__.py | 4 +- allennlp/interpret/attackers/hotflip.py | 23 +- .../interpret/attackers/input_reduction.py | 6 +- .../saliency_interpreters/__init__.py | 8 +- .../integrated_gradient.py | 4 +- .../saliency_interpreters/simple_gradient.py | 4 +- .../saliency_interpreters/smooth_gradient.py | 4 +- allennlp/models/basic_classifier.py | 7 +- allennlp/models/model.py | 4 +- allennlp/modules/attention/__init__.py | 4 +- allennlp/modules/attention/attention.py | 5 +- allennlp/modules/augmented_lstm.py | 36 +- allennlp/modules/backbones/__init__.py | 4 +- .../modules/backbones/vilbert_backbone.py | 6 +- allennlp/modules/bimpm_matching.py | 12 +- allennlp/modules/conditional_random_field.py | 5 +- allennlp/modules/elmo.py | 28 +- allennlp/modules/elmo_lstm.py | 17 +- allennlp/modules/matrix_attention/__init__.py | 16 +- .../linear_matrix_attention.py | 4 +- allennlp/modules/sampled_softmax_loss.py | 5 +- allennlp/modules/scalar_mix.py | 3 +- allennlp/modules/seq2seq_encoders/__init__.py | 4 +- .../seq2seq_encoders/gated_cnn_encoder.py | 7 +- .../pytorch_seq2seq_wrapper.py | 17 +- .../modules/seq2vec_encoders/bert_pooler.py | 5 +- .../seq2vec_encoders/cnn_highway_encoder.py | 5 +- .../pytorch_seq2vec_wrapper.py | 5 +- allennlp/modules/span_extractors/__init__.py | 4 +- .../bidirectional_endpoint_span_extractor.py | 9 +- .../endpoint_span_extractor.py | 3 +- .../modules/text_field_embedders/__init__.py | 4 +- allennlp/modules/token_embedders/__init__.py | 8 +- allennlp/modules/token_embedders/embedding.py | 23 +- .../pretrained_transformer_embedder.py | 25 +- ...trained_transformer_mismatched_embedder.py | 10 +- 
allennlp/modules/transformer/__init__.py | 9 +- .../transformer/bimodal_connection_layer.py | 5 +- .../modules/transformer/self_attention.py | 5 +- .../transformer/transformer_embeddings.py | 5 +- .../modules/transformer/transformer_layer.py | 8 +- .../modules/transformer/transformer_module.py | 5 +- .../modules/transformer/transformer_stack.py | 7 +- allennlp/modules/transformer/util.py | 3 +- allennlp/nn/beam_search.py | 30 +- allennlp/nn/chu_liu_edmonds.py | 16 +- allennlp/nn/initializers.py | 14 +- allennlp/nn/util.py | 29 +- allennlp/predictors/sentence_tagger.py | 5 +- allennlp/tools/archive_surgery.py | 5 +- .../create_elmo_embeddings_from_vocab.py | 12 +- allennlp/training/__init__.py | 1 - allennlp/training/checkpointer.py | 10 +- allennlp/training/deepspeed/__init__.py | 6 - allennlp/training/deepspeed/checkpointer.py | 10 +- allennlp/training/deepspeed/optimizers.py | 27 - allennlp/training/deepspeed/trainer.py | 273 ++-------- .../learning_rate_schedulers/__init__.py | 12 +- .../learning_rate_schedulers/combined.py | 4 +- .../learning_rate_schedulers/cosine.py | 4 +- .../learning_rate_scheduler.py | 17 +- .../linear_with_warmup.py | 4 +- .../training/learning_rate_schedulers/noam.py | 4 +- .../polynomial_decay.py | 4 +- .../slanted_triangular.py | 9 +- allennlp/training/metrics/auc.py | 9 +- .../training/metrics/categorical_accuracy.py | 3 +- .../metrics/evalb_bracketing_scorer.py | 6 +- allennlp/training/metrics/fbeta_measure.py | 6 +- allennlp/training/metrics/metric.py | 5 +- .../training/metrics/span_based_f1_measure.py | 4 +- .../training/metrics/spearman_correlation.py | 3 +- allennlp/training/moving_average.py | 3 +- allennlp/training/scheduler.py | 5 +- allennlp/training/tensorboard_writer.py | 19 +- allennlp/training/trainer.py | 4 +- allennlp/training/util.py | 13 +- scripts/ai2_internal/resume_daemon.py | 14 +- scripts/ai2_internal/run_with_beaker.py | 42 +- scripts/build_docs_config.py | 3 +- scripts/close_stale_issues.py | 7 +- scripts/py2md.py | 5 +- scripts/train_fixtures.py | 3 +- setup.py | 1 + tests/commands/cached_path_test.py | 16 +- tests/commands/evaluate_test.py | 8 +- tests/commands/find_learning_rate_test.py | 21 +- tests/commands/main_test.py | 8 +- tests/commands/predict_test.py | 33 +- tests/commands/print_results_test.py | 6 +- tests/commands/train_test.py | 170 +++++-- tests/common/file_utils_test.py | 26 +- tests/common/from_params_test.py | 30 +- tests/common/params_test.py | 5 +- tests/common/util_test.py | 16 +- .../multiprocess_data_loader_test.py | 11 +- .../multitask_data_loader_test.py | 5 +- tests/data/dataloader_test.py | 46 ++ .../data/dataset_readers/babi_reader_test.py | 11 +- .../dataset_readers/dataset_reader_test.py | 466 +++++++++++++----- .../dataset_utils/span_utils_test.py | 3 +- .../interleaving_dataset_reader_test.py | 10 +- .../lazy_dataset_reader_test.py | 62 +++ .../dataset_readers/sequence_tagging_test.py | 16 +- .../sharded_dataset_reader_test.py | 22 +- .../text_classification_json_test.py | 35 +- tests/data/fields/array_field_test.py | 115 +++++ tests/data/fields/list_field_test.py | 13 +- .../samplers/bucket_batch_sampler_test.py | 59 ++- .../samplers/max_tokens_batch_sampler_test.py | 49 +- tests/data/samplers/sampler_test.py | 15 +- .../pretrained_transformer_indexer_test.py | 2 +- .../letters_digits_tokenizer_test.py | 3 +- tests/data/tokenizers/spacy_tokenizer_test.py | 3 +- tests/data/vocabulary_test.py | 16 - tests/models/archival_test.py | 10 +- .../scaled_dot_product_attention_test.py | 4 +- 
tests/modules/elmo_test.py | 7 +- .../pytorch_transformer_wrapper_test.py | 70 +-- .../pretrained_transformer_embedder_test.py | 15 - tests/modules/transformer/toolkit_test.py | 16 +- .../transformer_embeddings_test.py | 22 +- tests/nn/util_test.py | 23 +- .../slanted_triangular_test.py | 9 +- tests/training/optimizer_test.py | 20 +- tests/training/trainer_test.py | 24 +- 162 files changed, 1959 insertions(+), 874 deletions(-) create mode 100644 tests/data/dataloader_test.py create mode 100644 tests/data/dataset_readers/lazy_dataset_reader_test.py create mode 100644 tests/data/fields/array_field_test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 03a0e5bc0ee..bd5ae19222d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +- Added `DeepspeedTrainer` and `FusedLambOptimizer`. + ### Changed - `coding_scheme` parameter is now deprecated in `Conll2003DatasetReader`, please use `convert_to_coding_scheme` instead. diff --git a/allennlp/commands/__init__.py b/allennlp/commands/__init__.py index 3a0fba2232f..561f9137a19 100644 --- a/allennlp/commands/__init__.py +++ b/allennlp/commands/__init__.py @@ -50,7 +50,9 @@ def add_argument(self, *args, **kwargs): super().add_argument(*args, **kwargs) -def parse_args(prog: Optional[str] = None) -> Tuple[argparse.ArgumentParser, argparse.Namespace]: +def parse_args( + prog: Optional[str] = None, +) -> Tuple[argparse.ArgumentParser, argparse.Namespace]: """ Creates the argument parser for the main program and uses it to parse the args. """ diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index d0b0692f857..63e4d23838e 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -27,17 +27,23 @@ class Evaluate(Subcommand): def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.ArgumentParser: description = """Evaluate the specified model + dataset""" subparser = parser.add_parser( - self.name, description=description, help="Evaluate the specified model + dataset." + self.name, + description=description, + help="Evaluate the specified model + dataset.", ) subparser.add_argument("archive_file", type=str, help="path to an archived trained model") subparser.add_argument( - "input_file", type=str, help="path to the file containing the evaluation data" + "input_file", + type=str, + help="path to the file containing the evaluation data", ) subparser.add_argument( - "--output-file", type=str, help="optional path to write the metrics to as JSON" + "--output-file", + type=str, + help="optional path to write the metrics to as JSON", ) subparser.add_argument( @@ -47,7 +53,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "--weights-file", type=str, help="a path that overrides which weights file to use" + "--weights-file", + type=str, + help="a path that overrides which weights file to use", ) cuda_device = subparser.add_mutually_exclusive_group(required=False) @@ -68,7 +76,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "--batch-size", type=int, help="If non-empty, the batch size to use during evaluation." 
+ "--batch-size", + type=int, + help="If non-empty, the batch size to use during evaluation.", ) subparser.add_argument( diff --git a/allennlp/commands/find_learning_rate.py b/allennlp/commands/find_learning_rate.py index 8a1f6380ed4..853a408070c 100644 --- a/allennlp/commands/find_learning_rate.py +++ b/allennlp/commands/find_learning_rate.py @@ -39,7 +39,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "param_path", type=str, help="path to parameter file describing the model to be trained" + "param_path", + type=str, + help="path to parameter file describing the model to be trained", ) subparser.add_argument( "-s", @@ -60,10 +62,16 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ), ) subparser.add_argument( - "--start-lr", type=float, default=1e-5, help="learning rate to start the search" + "--start-lr", + type=float, + default=1e-5, + help="learning rate to start the search", ) subparser.add_argument( - "--end-lr", type=float, default=10, help="learning rate up to which search is done" + "--end-lr", + type=float, + default=10, + help="learning rate up to which search is done", ) subparser.add_argument( "--num-batches", diff --git a/allennlp/commands/predict.py b/allennlp/commands/predict.py index 24ac891b9a4..8742828d7f2 100644 --- a/allennlp/commands/predict.py +++ b/allennlp/commands/predict.py @@ -28,7 +28,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument description = """Run the specified model against a JSON-lines input file.""" subparser = parser.add_parser( - self.name, description=description, help="Use a trained model to make predictions." + self.name, + description=description, + help="Use a trained model to make predictions.", ) subparser.add_argument( @@ -38,12 +40,17 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument subparser.add_argument("--output-file", type=str, help="path to output file") subparser.add_argument( - "--weights-file", type=str, help="a path that overrides which weights file to use" + "--weights-file", + type=str, + help="a path that overrides which weights file to use", ) batch_size = subparser.add_mutually_exclusive_group(required=False) batch_size.add_argument( - "--batch-size", type=int, default=1, help="The batch size to use for processing" + "--batch-size", + type=int, + default=1, + help="The batch size to use for processing", ) subparser.add_argument( @@ -86,7 +93,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "--predictor", type=str, help="optionally specify a specific predictor to use" + "--predictor", + type=str, + help="optionally specify a specific predictor to use", ) subparser.add_argument( diff --git a/allennlp/commands/subcommand.py b/allennlp/commands/subcommand.py index 3efdef3e71e..ee327b22f33 100644 --- a/allennlp/commands/subcommand.py +++ b/allennlp/commands/subcommand.py @@ -39,7 +39,10 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument @classmethod @overrides def register( - cls: Type[T], name: str, constructor: Optional[str] = None, exist_ok: bool = False + cls: Type[T], + name: str, + constructor: Optional[str] = None, + exist_ok: bool = False, ) -> Callable[[Type[T]], Type[T]]: super_register_fn = super().register(name, constructor=constructor, exist_ok=exist_ok) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 33d5df63acb..c32c792ed1b 
100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -24,7 +24,11 @@ from allennlp.common.plugins import import_plugins from allennlp.data import DatasetReader, Vocabulary from allennlp.data import DataLoader -from allennlp.models.archival import archive_model, CONFIG_NAME, verify_include_in_archive +from allennlp.models.archival import ( + archive_model, + CONFIG_NAME, + verify_include_in_archive, +) from allennlp.models.model import _DEFAULT_WEIGHTS, Model from allennlp.training.trainer import Trainer from allennlp.training import util as training_util @@ -40,7 +44,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument subparser = parser.add_parser(self.name, description=description, help="Train a model.") subparser.add_argument( - "param_path", type=str, help="path to parameter file describing the model to be trained" + "param_path", + type=str, + help="path to parameter file describing the model to be trained", ) subparser.add_argument( @@ -80,7 +86,10 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "--node-rank", type=int, default=0, help="rank of this node in the distributed setup" + "--node-rank", + type=int, + default=0, + help="rank of this node in the distributed setup", ) subparser.add_argument( diff --git a/allennlp/common/cached_transformers.py b/allennlp/common/cached_transformers.py index e3e700af8a2..bbdf20ff9df 100644 --- a/allennlp/common/cached_transformers.py +++ b/allennlp/common/cached_transformers.py @@ -66,7 +66,9 @@ def strip_prefix(s): } if len(valid_keys) > 0: logger.info( - "Loading %d tensors from %s", len(valid_keys), override_weights_file + "Loading %d tensors from %s", + len(valid_keys), + override_weights_file, ) else: raise ValueError( diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py index ecb252fc5e2..2aaef669c6f 100644 --- a/allennlp/common/file_utils.py +++ b/allennlp/common/file_utils.py @@ -651,7 +651,10 @@ class CacheFile: """ def __init__( - self, cache_filename: Union[PathLike, str], mode: str = "w+b", suffix: str = ".tmp" + self, + cache_filename: Union[PathLike, str], + mode: str = "w+b", + suffix: str = ".tmp", ) -> None: self.cache_filename = ( cache_filename if isinstance(cache_filename, Path) else Path(cache_filename) @@ -670,7 +673,9 @@ def __exit__(self, exc_type, exc_value, traceback): if exc_value is None: # Success. logger.debug( - "Renaming temp file %s to cache at %s", self.temp_file.name, self.cache_filename + "Renaming temp file %s to cache at %s", + self.temp_file.name, + self.cache_filename, ) # Rename the temp file to the actual cache filename. os.replace(self.temp_file.name, self.cache_filename) @@ -921,7 +926,10 @@ def get_file_extension(path: str, dot=True, lower: bool = True): def open_compressed( - filename: Union[str, PathLike], mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs + filename: Union[str, PathLike], + mode: str = "rt", + encoding: Optional[str] = "UTF-8", + **kwargs, ): if not isinstance(filename, str): filename = str(filename) diff --git a/allennlp/common/from_params.py b/allennlp/common/from_params.py index 6db2de629b2..be2787518f7 100644 --- a/allennlp/common/from_params.py +++ b/allennlp/common/from_params.py @@ -86,7 +86,9 @@ def is_base_registrable(cls) -> bool: Checks whether this is a class that directly inherits from Registrable, or is a subclass of such a class. 
""" - from allennlp.common.registrable import Registrable # import here to avoid circular imports + from allennlp.common.registrable import ( + Registrable, + ) # import here to avoid circular imports if not issubclass(cls, Registrable): return False @@ -148,7 +150,10 @@ def infer_params( else: super_parameters = {} - return {**super_parameters, **parameters} # Subclass parameters overwrite superclass ones + return { + **super_parameters, + **parameters, + } # Subclass parameters overwrite superclass ones def create_kwargs( @@ -245,7 +250,12 @@ def create_extras(cls: Type[T], extras: Dict[str, Any]) -> Dict[str, Any]: def pop_and_construct_arg( - class_name: str, argument_name: str, annotation: Type, default: Any, params: Params, **extras + class_name: str, + argument_name: str, + annotation: Type, + default: Any, + params: Params, + **extras, ) -> Any: """ Does the work of actually constructing an individual argument for @@ -261,7 +271,9 @@ def pop_and_construct_arg( `inspect.Parameter` object directly, so that we can handle `Union` types using recursion on this method, trying the different annotation types in the union in turn. """ - from allennlp.models.archival import load_archive # import here to avoid circular imports + from allennlp.models.archival import ( + load_archive, + ) # import here to avoid circular imports # We used `argument_name` as the method argument to avoid conflicts with 'name' being a key in # `extras`, which isn't _that_ unlikely. Now that we are inside the method, we can switch back @@ -536,7 +548,9 @@ def from_params( constructor (because you inspect `__init__`, but call `cls()`). """ - from allennlp.common.registrable import Registrable # import here to avoid circular imports + from allennlp.common.registrable import ( + Registrable, + ) # import here to avoid circular imports logger.debug( f"instantiating class {cls} from params {getattr(params, 'params', params)} " diff --git a/allennlp/common/plugins.py b/allennlp/common/plugins.py index e114631f3ab..45f7596ad08 100644 --- a/allennlp/common/plugins.py +++ b/allennlp/common/plugins.py @@ -33,14 +33,21 @@ The global plugins file will be found here. """ -DEFAULT_PLUGINS = ("allennlp_models", "allennlp_semparse", "allennlp_server") +DEFAULT_PLUGINS = ( + "allennlp_models", + "allennlp_semparse", + "allennlp_server", + "allennlp.training.deepspeed", +) """ Default plugins do not need to be declared in a plugins file. They will always be imported when they are installed in the current Python environment. """ -def discover_file_plugins(plugins_filename: str = LOCAL_PLUGINS_FILENAME) -> Iterable[str]: +def discover_file_plugins( + plugins_filename: str = LOCAL_PLUGINS_FILENAME, +) -> Iterable[str]: """ Returns an iterable of the plugins found, declared within a file whose path is `plugins_filename`. 
""" diff --git a/allennlp/common/testing/model_test_case.py b/allennlp/common/testing/model_test_case.py index d920152b2fc..5594493b480 100644 --- a/allennlp/common/testing/model_test_case.py +++ b/allennlp/common/testing/model_test_case.py @@ -54,7 +54,9 @@ def set_up_model( self.vocab = vocab self.instances = instances self.model = Model.from_params( - vocab=self.vocab, params=params["model"], serialization_dir=serialization_dir + vocab=self.vocab, + params=params["model"], + serialization_dir=serialization_dir, ) # TODO(joelgrus) get rid of these @@ -149,13 +151,17 @@ def ensure_model_can_train_save_and_load( print("Reading with original model") data_loader = DataLoader.from_params( - params=data_loader_params, reader=reader, data_path=params["validation_data_path"] + params=data_loader_params, + reader=reader, + data_path=params["validation_data_path"], ) data_loader.index_with(model.vocab) print("Reading with loaded model") data_loader2 = DataLoader.from_params( - params=data_loader_params2, reader=reader, data_path=params["validation_data_path"] + params=data_loader_params2, + reader=reader, + data_path=params["validation_data_path"], ) data_loader2.index_with(loaded_model.vocab) @@ -193,7 +199,10 @@ def ensure_model_can_train_save_and_load( # Both outputs should have the same keys and the values for these keys should be close. for key in model_predictions.keys(): self.assert_fields_equal( - model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance + model_predictions[key], + loaded_model_predictions[key], + name=key, + tolerance=tolerance, ) # Check loaded model's loss exists and we can compute gradients, for continuing training. @@ -277,7 +286,10 @@ def assert_fields_equal(self, field1, field2, name: str, tolerance: float = 1e-6 assert field1.keys() == field2.keys() for key in field1: self.assert_fields_equal( - field1[key], field2[key], tolerance=tolerance, name=name + "." + str(key) + field1[key], + field2[key], + tolerance=tolerance, + name=name + "." + str(key), ) elif isinstance(field1, (list, tuple)): assert len(field1) == len(field2) diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 9f466e8ee6b..56291f70976 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -23,7 +23,8 @@ class AllenNlpTestCase: def setup_method(self): logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.DEBUG + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + level=logging.DEBUG, ) # Disabling some of the more verbose logging statements that typically aren't very helpful # in tests. 
diff --git a/allennlp/common/util.py b/allennlp/common/util.py index eddb7600ffd..de4ed6461c9 100644 --- a/allennlp/common/util.py +++ b/allennlp/common/util.py @@ -466,7 +466,10 @@ def int_to_device(device: Union[int, torch.device]) -> torch.device: def log_frozen_and_tunable_parameter_names(model: torch.nn.Module) -> None: - frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(model) + ( + frozen_parameter_names, + tunable_parameter_names, + ) = get_frozen_and_tunable_parameter_names(model) logger.info("The following parameters are Frozen (without gradient):") for name in frozen_parameter_names: diff --git a/allennlp/data/data_loaders/__init__.py b/allennlp/data/data_loaders/__init__.py index 8c2dfe8776c..ce94ed8dd69 100644 --- a/allennlp/data/data_loaders/__init__.py +++ b/allennlp/data/data_loaders/__init__.py @@ -1,4 +1,11 @@ -from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict, allennlp_collate -from allennlp.data.data_loaders.multiprocess_data_loader import MultiProcessDataLoader, WorkerError +from allennlp.data.data_loaders.data_loader import ( + DataLoader, + TensorDict, + allennlp_collate, +) +from allennlp.data.data_loaders.multiprocess_data_loader import ( + MultiProcessDataLoader, + WorkerError, +) from allennlp.data.data_loaders.multitask_data_loader import MultiTaskDataLoader from allennlp.data.data_loaders.simple_data_loader import SimpleDataLoader diff --git a/allennlp/data/data_loaders/multiprocess_data_loader.py b/allennlp/data/data_loaders/multiprocess_data_loader.py index d0681bc0c78..e170592c0a5 100644 --- a/allennlp/data/data_loaders/multiprocess_data_loader.py +++ b/allennlp/data/data_loaders/multiprocess_data_loader.py @@ -12,7 +12,11 @@ from allennlp.common.util import lazy_groups_of, shuffle_iterable from allennlp.common.tqdm import Tqdm from allennlp.data.instance import Instance -from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict, allennlp_collate +from allennlp.data.data_loaders.data_loader import ( + DataLoader, + TensorDict, + allennlp_collate, +) from allennlp.data.dataset_readers import DatasetReader, WorkerInfo, DatasetReaderInput from allennlp.data.fields import TextField from allennlp.data.samplers import BatchSampler diff --git a/allennlp/data/data_loaders/multitask_data_loader.py b/allennlp/data/data_loaders/multitask_data_loader.py index 222bd7d8324..9047f2d3efe 100644 --- a/allennlp/data/data_loaders/multitask_data_loader.py +++ b/allennlp/data/data_loaders/multitask_data_loader.py @@ -6,7 +6,10 @@ from overrides import overrides from allennlp.common import util -from allennlp.data.dataset_readers.dataset_reader import DatasetReader, DatasetReaderInput +from allennlp.data.dataset_readers.dataset_reader import ( + DatasetReader, + DatasetReaderInput, +) from allennlp.data.batch import Batch from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict from allennlp.data.data_loaders.multiprocess_data_loader import MultiProcessDataLoader diff --git a/allennlp/data/data_loaders/multitask_scheduler.py b/allennlp/data/data_loaders/multitask_scheduler.py index f77d070f498..044ba57669f 100644 --- a/allennlp/data/data_loaders/multitask_scheduler.py +++ b/allennlp/data/data_loaders/multitask_scheduler.py @@ -71,7 +71,9 @@ def batch_instances( self, epoch_instances: Dict[str, Iterable[Instance]] ) -> Iterable[List[Instance]]: return _chunked_iterator( - more_itertools.roundrobin(*epoch_instances.values()), self.batch_size, self.drop_last + 
more_itertools.roundrobin(*epoch_instances.values()), + self.batch_size, + self.drop_last, ) def count_batches(self, dataset_counts: Dict[str, int]) -> int: diff --git a/allennlp/data/data_loaders/simple_data_loader.py b/allennlp/data/data_loaders/simple_data_loader.py index 26b66b30893..9c77021e16f 100644 --- a/allennlp/data/data_loaders/simple_data_loader.py +++ b/allennlp/data/data_loaders/simple_data_loader.py @@ -6,7 +6,11 @@ import torch from allennlp.common.util import lazy_groups_of -from allennlp.data.data_loaders.data_loader import DataLoader, allennlp_collate, TensorDict +from allennlp.data.data_loaders.data_loader import ( + DataLoader, + allennlp_collate, + TensorDict, +) from allennlp.data.dataset_readers import DatasetReader from allennlp.data.instance import Instance from allennlp.data.vocabulary import Vocabulary diff --git a/allennlp/data/dataset_readers/__init__.py b/allennlp/data/dataset_readers/__init__.py index 274d9d7e4ee..72f9ba3f9f3 100644 --- a/allennlp/data/dataset_readers/__init__.py +++ b/allennlp/data/dataset_readers/__init__.py @@ -14,8 +14,12 @@ ) from allennlp.data.dataset_readers.babi import BabiReader from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader -from allennlp.data.dataset_readers.interleaving_dataset_reader import InterleavingDatasetReader +from allennlp.data.dataset_readers.interleaving_dataset_reader import ( + InterleavingDatasetReader, +) from allennlp.data.dataset_readers.multitask import MultiTaskDatasetReader from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader from allennlp.data.dataset_readers.sharded_dataset_reader import ShardedDatasetReader -from allennlp.data.dataset_readers.text_classification_json import TextClassificationJsonReader +from allennlp.data.dataset_readers.text_classification_json import ( + TextClassificationJsonReader, +) diff --git a/allennlp/data/dataset_readers/conll2003.py b/allennlp/data/dataset_readers/conll2003.py index 19ca273c258..d2bbffe36d8 100644 --- a/allennlp/data/dataset_readers/conll2003.py +++ b/allennlp/data/dataset_readers/conll2003.py @@ -109,7 +109,9 @@ def __init__( convert_to_coding_scheme = coding_scheme super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} if tag_label is not None and tag_label not in self._VALID_LABELS: diff --git a/allennlp/data/dataset_readers/dataset_reader.py b/allennlp/data/dataset_readers/dataset_reader.py index 58614160b81..dfce75a033c 100644 --- a/allennlp/data/dataset_readers/dataset_reader.py +++ b/allennlp/data/dataset_readers/dataset_reader.py @@ -368,7 +368,10 @@ def _multi_worker_islice( UserWarning, ) sharded_slice = itertools.islice( - sharded_slice, self._worker_info.id, None, self._worker_info.num_workers + sharded_slice, + self._worker_info.id, + None, + self._worker_info.num_workers, ) if max_instances is not None: diff --git a/allennlp/data/dataset_readers/dataset_utils/__init__.py b/allennlp/data/dataset_readers/dataset_utils/__init__.py index 4af41f46ce9..56d972e0c29 100644 --- a/allennlp/data/dataset_readers/dataset_utils/__init__.py +++ b/allennlp/data/dataset_readers/dataset_utils/__init__.py @@ -1,4 +1,7 @@ from allennlp.data.dataset_readers.dataset_utils.span_utils import enumerate_spans from allennlp.data.dataset_readers.dataset_utils.span_utils import bio_tags_to_spans -from 
allennlp.data.dataset_readers.dataset_utils.span_utils import to_bioul, iob1_to_bioul +from allennlp.data.dataset_readers.dataset_utils.span_utils import ( + to_bioul, + iob1_to_bioul, +) from allennlp.data.dataset_readers.dataset_utils.span_utils import bioul_tags_to_spans diff --git a/allennlp/data/dataset_readers/sequence_tagging.py b/allennlp/data/dataset_readers/sequence_tagging.py index 40f03a5d6de..0de82b82614 100644 --- a/allennlp/data/dataset_readers/sequence_tagging.py +++ b/allennlp/data/dataset_readers/sequence_tagging.py @@ -50,7 +50,9 @@ def __init__( **kwargs, ) -> None: super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} self._word_tag_delimiter = word_tag_delimiter diff --git a/allennlp/data/dataset_readers/sharded_dataset_reader.py b/allennlp/data/dataset_readers/sharded_dataset_reader.py index 2976bb332eb..0f1505d1d3c 100644 --- a/allennlp/data/dataset_readers/sharded_dataset_reader.py +++ b/allennlp/data/dataset_readers/sharded_dataset_reader.py @@ -38,7 +38,9 @@ class ShardedDatasetReader(DatasetReader): def __init__(self, base_reader: DatasetReader, **kwargs) -> None: super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self.reader = base_reader # We have to make the base reader think that it's the only worker so that it doesn't diff --git a/allennlp/data/dataset_readers/text_classification_json.py b/allennlp/data/dataset_readers/text_classification_json.py index 81d1a80ebfc..5dea99685ea 100644 --- a/allennlp/data/dataset_readers/text_classification_json.py +++ b/allennlp/data/dataset_readers/text_classification_json.py @@ -56,7 +56,9 @@ def __init__( **kwargs, ) -> None: super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self._tokenizer = tokenizer or SpacyTokenizer() self._segment_sentences = segment_sentences diff --git a/allennlp/data/fields/adjacency_field.py b/allennlp/data/fields/adjacency_field.py index cf45cf8cf98..a6d2ffe6f28 100644 --- a/allennlp/data/fields/adjacency_field.py +++ b/allennlp/data/fields/adjacency_field.py @@ -135,7 +135,9 @@ def empty_field(self) -> "AdjacencyField": # The empty_list here is needed for mypy empty_list: List[Tuple[int, int]] = [] adjacency_field = AdjacencyField( - empty_list, self.sequence_field.empty_field(), padding_value=self._padding_value + empty_list, + self.sequence_field.empty_field(), + padding_value=self._padding_value, ) return adjacency_field diff --git a/allennlp/data/fields/field.py b/allennlp/data/fields/field.py index 40842293e26..a52fde6961b 100644 --- a/allennlp/data/fields/field.py +++ b/allennlp/data/fields/field.py @@ -6,7 +6,10 @@ from allennlp.data.vocabulary import Vocabulary DataArray = TypeVar( - "DataArray", torch.Tensor, Dict[str, torch.Tensor], Dict[str, Dict[str, torch.Tensor]] + "DataArray", + torch.Tensor, + Dict[str, torch.Tensor], + Dict[str, Dict[str, torch.Tensor]], ) diff --git a/allennlp/data/fields/label_field.py b/allennlp/data/fields/label_field.py index 06ebf47c579..bfc13e1145a 100644 --- a/allennlp/data/fields/label_field.py +++ b/allennlp/data/fields/label_field.py @@ -46,7 +46,10 @@ class 
LabelField(Field[torch.Tensor]): _already_warned_namespaces: Set[str] = set() def __init__( - self, label: Union[str, int], label_namespace: str = "labels", skip_indexing: bool = False + self, + label: Union[str, int], + label_namespace: str = "labels", + skip_indexing: bool = False, ) -> None: self.label = label self._label_namespace = label_namespace diff --git a/allennlp/data/fields/list_field.py b/allennlp/data/fields/list_field.py index 0a77a75a5d8..166d4f69b37 100644 --- a/allennlp/data/fields/list_field.py +++ b/allennlp/data/fields/list_field.py @@ -86,7 +86,9 @@ def sequence_length(self) -> int: @overrides def as_tensor(self, padding_lengths: Dict[str, int]) -> DataArray: padded_field_list = pad_sequence_to_length( - self.field_list, padding_lengths["num_fields"], self.field_list[0].empty_field + self.field_list, + padding_lengths["num_fields"], + self.field_list[0].empty_field, ) # Here we're removing the scoping on the padding length keys that we added in # `get_padding_lengths`; see the note there for more detail. diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 9d171223fd6..6997a6d0715 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -45,7 +45,9 @@ class TextField(SequenceField[TextFieldTensors]): __slots__ = ["tokens", "_token_indexers", "_indexed_tokens"] def __init__( - self, tokens: List[Token], token_indexers: Optional[Dict[str, TokenIndexer]] = None + self, + tokens: List[Token], + token_indexers: Optional[Dict[str, TokenIndexer]] = None, ) -> None: self.tokens = tokens self._token_indexers = token_indexers diff --git a/allennlp/data/image_loader.py b/allennlp/data/image_loader.py index f5f081763c6..cc7c5c1e724 100644 --- a/allennlp/data/image_loader.py +++ b/allennlp/data/image_loader.py @@ -69,7 +69,9 @@ def __call__(self, filename_or_filenames: Union[OnePath, ManyPaths]) -> ImagesWi size = cast( IntTensor, torch.tensor( - [image.shape[-2], image.shape[-1]], dtype=torch.int32, device=self.device + [image.shape[-2], image.shape[-1]], + dtype=torch.int32, + device=self.device, ), ) images.append(image) diff --git a/allennlp/data/token_indexers/__init__.py b/allennlp/data/token_indexers/__init__.py index 912c6bd57f5..07849db674f 100644 --- a/allennlp/data/token_indexers/__init__.py +++ b/allennlp/data/token_indexers/__init__.py @@ -7,7 +7,9 @@ from allennlp.data.token_indexers.token_indexer import TokenIndexer from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer from allennlp.data.token_indexers.spacy_indexer import SpacyTokenIndexer -from allennlp.data.token_indexers.pretrained_transformer_indexer import PretrainedTransformerIndexer +from allennlp.data.token_indexers.pretrained_transformer_indexer import ( + PretrainedTransformerIndexer, +) from allennlp.data.token_indexers.pretrained_transformer_mismatched_indexer import ( PretrainedTransformerMismatchedIndexer, ) diff --git a/allennlp/data/token_indexers/elmo_indexer.py b/allennlp/data/token_indexers/elmo_indexer.py index c5e6c37d910..5167a8814a7 100644 --- a/allennlp/data/token_indexers/elmo_indexer.py +++ b/allennlp/data/token_indexers/elmo_indexer.py @@ -153,7 +153,9 @@ def padding_token(): tensor_dict["elmo_tokens"] = torch.LongTensor( pad_sequence_to_length( - tokens["elmo_tokens"], padding_lengths["elmo_tokens"], default_value=padding_token + tokens["elmo_tokens"], + padding_lengths["elmo_tokens"], + default_value=padding_token, ) ) return tensor_dict diff --git a/allennlp/data/tokenizers/__init__.py 
b/allennlp/data/tokenizers/__init__.py index d2501600f81..aa19c0f5eb3 100644 --- a/allennlp/data/tokenizers/__init__.py +++ b/allennlp/data/tokenizers/__init__.py @@ -7,7 +7,9 @@ from allennlp.data.tokenizers.tokenizer import Tokenizer from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer from allennlp.data.tokenizers.letters_digits_tokenizer import LettersDigitsTokenizer -from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer +from allennlp.data.tokenizers.pretrained_transformer_tokenizer import ( + PretrainedTransformerTokenizer, +) from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer from allennlp.data.tokenizers.sentence_splitter import SentenceSplitter from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer diff --git a/allennlp/data/vocabulary.py b/allennlp/data/vocabulary.py index eca12a1495b..89aa17e1da6 100644 --- a/allennlp/data/vocabulary.py +++ b/allennlp/data/vocabulary.py @@ -10,7 +10,17 @@ import re from collections import defaultdict from transformers import PreTrainedTokenizer -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Union, TYPE_CHECKING +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Set, + Union, + TYPE_CHECKING, +) from allennlp.common import Registrable from allennlp.common.file_utils import cached_path, FileLock diff --git a/allennlp/interpret/__init__.py b/allennlp/interpret/__init__.py index 3111d8ee6bf..b45c9b3fa83 100644 --- a/allennlp/interpret/__init__.py +++ b/allennlp/interpret/__init__.py @@ -1,2 +1,4 @@ from allennlp.interpret.attackers.attacker import Attacker -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) diff --git a/allennlp/interpret/attackers/hotflip.py b/allennlp/interpret/attackers/hotflip.py index a9d15db7615..bd7cddf3f01 100644 --- a/allennlp/interpret/attackers/hotflip.py +++ b/allennlp/interpret/attackers/hotflip.py @@ -18,7 +18,17 @@ from allennlp.nn import util from allennlp.predictors.predictor import Predictor -DEFAULT_IGNORE_TOKENS = ["@@NULL@@", ".", ",", ";", "!", "?", "[MASK]", "[SEP]", "[CLS]"] +DEFAULT_IGNORE_TOKENS = [ + "@@NULL@@", + ".", + ",", + ";", + "!", + "?", + "[MASK]", + "[SEP]", + "[CLS]", +] @Attacker.register("hotflip") @@ -57,7 +67,10 @@ class Hotflip(Attacker): """ def __init__( - self, predictor: Predictor, vocab_namespace: str = "tokens", max_tokens: int = 5000 + self, + predictor: Predictor, + vocab_namespace: str = "tokens", + max_tokens: int = 5000, ) -> None: super().__init__(predictor) self.vocab = self.predictor._model.vocab @@ -230,7 +243,11 @@ def attack_from_json( final_outputs.append(outputs) return sanitize( - {"final": final_tokens, "original": original_tokens, "outputs": final_outputs} + { + "final": final_tokens, + "original": original_tokens, + "outputs": final_outputs, + } ) def attack_instance( diff --git a/allennlp/interpret/attackers/input_reduction.py b/allennlp/interpret/attackers/input_reduction.py index b098c858fb7..72702dd5e69 100644 --- a/allennlp/interpret/attackers/input_reduction.py +++ b/allennlp/interpret/attackers/input_reduction.py @@ -51,7 +51,11 @@ def attack_from_json( for instance in original_instances: final_tokens.append( self._attack_instance( - inputs, instance, input_field_to_attack, grad_input_field, ignore_tokens + inputs, + instance, + input_field_to_attack, + 
grad_input_field, + ignore_tokens, ) ) return sanitize({"final": final_tokens, "original": original_tokens}) diff --git a/allennlp/interpret/saliency_interpreters/__init__.py b/allennlp/interpret/saliency_interpreters/__init__.py index 1fc08d2ec9d..911bf97cdc8 100644 --- a/allennlp/interpret/saliency_interpreters/__init__.py +++ b/allennlp/interpret/saliency_interpreters/__init__.py @@ -1,4 +1,8 @@ -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) from allennlp.interpret.saliency_interpreters.simple_gradient import SimpleGradient -from allennlp.interpret.saliency_interpreters.integrated_gradient import IntegratedGradient +from allennlp.interpret.saliency_interpreters.integrated_gradient import ( + IntegratedGradient, +) from allennlp.interpret.saliency_interpreters.smooth_gradient import SmoothGradient diff --git a/allennlp/interpret/saliency_interpreters/integrated_gradient.py b/allennlp/interpret/saliency_interpreters/integrated_gradient.py index 8b1f8f0af26..5d353a46bc6 100644 --- a/allennlp/interpret/saliency_interpreters/integrated_gradient.py +++ b/allennlp/interpret/saliency_interpreters/integrated_gradient.py @@ -6,7 +6,9 @@ from allennlp.common.util import JsonDict, sanitize from allennlp.data import Instance -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) from allennlp.nn import util diff --git a/allennlp/interpret/saliency_interpreters/simple_gradient.py b/allennlp/interpret/saliency_interpreters/simple_gradient.py index 639da42e824..ffe5f622a43 100644 --- a/allennlp/interpret/saliency_interpreters/simple_gradient.py +++ b/allennlp/interpret/saliency_interpreters/simple_gradient.py @@ -5,7 +5,9 @@ import torch from allennlp.common.util import JsonDict, sanitize -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) from allennlp.nn import util diff --git a/allennlp/interpret/saliency_interpreters/smooth_gradient.py b/allennlp/interpret/saliency_interpreters/smooth_gradient.py index 7f088fa4789..55688b47c2a 100644 --- a/allennlp/interpret/saliency_interpreters/smooth_gradient.py +++ b/allennlp/interpret/saliency_interpreters/smooth_gradient.py @@ -6,7 +6,9 @@ from allennlp.common.util import JsonDict, sanitize from allennlp.data import Instance -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) from allennlp.predictors import Predictor diff --git a/allennlp/models/basic_classifier.py b/allennlp/models/basic_classifier.py index 26602ccca1f..11e02853eb8 100644 --- a/allennlp/models/basic_classifier.py +++ b/allennlp/models/basic_classifier.py @@ -5,7 +5,12 @@ from allennlp.data import TextFieldTensors, Vocabulary from allennlp.models.model import Model -from allennlp.modules import FeedForward, Seq2SeqEncoder, Seq2VecEncoder, TextFieldEmbedder +from allennlp.modules import ( + FeedForward, + Seq2SeqEncoder, + Seq2VecEncoder, + TextFieldEmbedder, +) from allennlp.nn import InitializerApplicator, util from allennlp.nn.util import get_text_field_mask from allennlp.training.metrics import CategoricalAccuracy 
diff --git a/allennlp/models/model.py b/allennlp/models/model.py index 68aff36f9ad..07604a1bda1 100644 --- a/allennlp/models/model.py +++ b/allennlp/models/model.py @@ -445,7 +445,9 @@ def from_archive(cls, archive_file: str, vocab: Vocabulary = None) -> "Model": If `vocab` is given, we will extend the loaded model's vocabulary using the passed vocab object (including calling `extend_embedder_vocab`, which extends embedding layers). """ - from allennlp.models.archival import load_archive # here to avoid circular imports + from allennlp.models.archival import ( + load_archive, + ) # here to avoid circular imports model = load_archive(archive_file).model if vocab: diff --git a/allennlp/modules/attention/__init__.py b/allennlp/modules/attention/__init__.py index ba9ba3ad021..a82a7c445af 100644 --- a/allennlp/modules/attention/__init__.py +++ b/allennlp/modules/attention/__init__.py @@ -4,4 +4,6 @@ from allennlp.modules.attention.cosine_attention import CosineAttention from allennlp.modules.attention.dot_product_attention import DotProductAttention from allennlp.modules.attention.linear_attention import LinearAttention -from allennlp.modules.attention.scaled_dot_product_attention import ScaledDotProductAttention +from allennlp.modules.attention.scaled_dot_product_attention import ( + ScaledDotProductAttention, +) diff --git a/allennlp/modules/attention/attention.py b/allennlp/modules/attention/attention.py index 1c525bf3bc4..9c2ba1d5c6b 100644 --- a/allennlp/modules/attention/attention.py +++ b/allennlp/modules/attention/attention.py @@ -40,7 +40,10 @@ def __init__(self, normalize: bool = True) -> None: @overrides def forward( - self, vector: torch.Tensor, matrix: torch.Tensor, matrix_mask: torch.BoolTensor = None + self, + vector: torch.Tensor, + matrix: torch.Tensor, + matrix_mask: torch.BoolTensor = None, ) -> torch.Tensor: similarities = self._forward_internal(vector, matrix) if self._normalize: diff --git a/allennlp/modules/augmented_lstm.py b/allennlp/modules/augmented_lstm.py index 93757aa6125..da55ffb6d30 100644 --- a/allennlp/modules/augmented_lstm.py +++ b/allennlp/modules/augmented_lstm.py @@ -38,7 +38,11 @@ class AugmentedLSTMCell(torch.nn.Module): """ def __init__( - self, embed_dim: int, lstm_dim: int, use_highway: bool = True, use_bias: bool = True + self, + embed_dim: int, + lstm_dim: int, + use_highway: bool = True, + use_bias: bool = True, ): super().__init__() self.embed_dim = embed_dim @@ -121,7 +125,13 @@ def forward( if self.use_highway: fused_op = projected_input[:, : 5 * self.lstm_dim] + projected_state fused_chunked = torch.chunk(fused_op, 5, 1) - (input_gate, forget_gate, memory_init, output_gate, highway_gate) = fused_chunked + ( + input_gate, + forget_gate, + memory_init, + output_gate, + highway_gate, + ) = fused_chunked highway_gate = torch.sigmoid(highway_gate) else: fused_op = projected_input + projected_state @@ -199,7 +209,9 @@ def __init__( ) def forward( - self, inputs: PackedSequence, states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self, + inputs: PackedSequence, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ) -> Tuple[PackedSequence, Tuple[torch.Tensor, torch.Tensor]]: """ Warning: Would be better to use the BiAugmentedLstm class in a regular model @@ -385,7 +397,9 @@ def __init__( self.representation_dim = lstm_embed_dim def forward( - self, inputs: torch.Tensor, states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self, + inputs: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ) -> 
Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Given an input batch of sequential data such as word embeddings, produces @@ -423,7 +437,9 @@ def forward( return self._forward_unidirectional(inputs, states) def _forward_bidirectional( - self, inputs: PackedSequence, states: Optional[Tuple[torch.Tensor, torch.Tensor]] + self, + inputs: PackedSequence, + states: Optional[Tuple[torch.Tensor, torch.Tensor]], ): output_sequence = inputs final_h = [] @@ -439,7 +455,8 @@ def _forward_bidirectional( else: hidden_states = list( zip( # type: ignore - states[0].chunk(self.num_layers, 0), states[1].chunk(self.num_layers, 0) + states[0].chunk(self.num_layers, 0), + states[1].chunk(self.num_layers, 0), ) ) for i, state in enumerate(hidden_states): @@ -473,7 +490,9 @@ def _forward_bidirectional( return output_sequence, final_state_tuple def _forward_unidirectional( - self, inputs: PackedSequence, states: Optional[Tuple[torch.Tensor, torch.Tensor]] + self, + inputs: PackedSequence, + states: Optional[Tuple[torch.Tensor, torch.Tensor]], ): output_sequence = inputs final_h = [] @@ -489,7 +508,8 @@ def _forward_unidirectional( else: hidden_states = list( zip( # type: ignore - states[0].chunk(self.num_layers, 0), states[1].chunk(self.num_layers, 0) + states[0].chunk(self.num_layers, 0), + states[1].chunk(self.num_layers, 0), ) # type: ignore ) diff --git a/allennlp/modules/backbones/__init__.py b/allennlp/modules/backbones/__init__.py index 050d67fd2e1..2738ebcec6b 100644 --- a/allennlp/modules/backbones/__init__.py +++ b/allennlp/modules/backbones/__init__.py @@ -1,3 +1,5 @@ from allennlp.modules.backbones.backbone import Backbone -from allennlp.modules.backbones.pretrained_transformer_backbone import PretrainedTransformerBackbone +from allennlp.modules.backbones.pretrained_transformer_backbone import ( + PretrainedTransformerBackbone, +) from allennlp.modules.backbones.vilbert_backbone import VilbertBackbone diff --git a/allennlp/modules/backbones/vilbert_backbone.py b/allennlp/modules/backbones/vilbert_backbone.py index 99f790d1896..e9f18fdd339 100644 --- a/allennlp/modules/backbones/vilbert_backbone.py +++ b/allennlp/modules/backbones/vilbert_backbone.py @@ -7,7 +7,11 @@ from allennlp.data.fields.text_field import TextFieldTensors from allennlp.data.vocabulary import Vocabulary from allennlp.modules.backbones.backbone import Backbone -from allennlp.modules.transformer import BiModalEncoder, ImageFeatureEmbeddings, Embeddings +from allennlp.modules.transformer import ( + BiModalEncoder, + ImageFeatureEmbeddings, + Embeddings, +) logger = logging.getLogger(__name__) diff --git a/allennlp/modules/bimpm_matching.py b/allennlp/modules/bimpm_matching.py index 6d75ede67bf..b69dd0c71d4 100644 --- a/allennlp/modules/bimpm_matching.py +++ b/allennlp/modules/bimpm_matching.py @@ -170,7 +170,9 @@ def create_parameter(): # utility function to create and initialize a parameter torch.nn.init.kaiming_normal_(param) return param - def share_or_create(weights_to_share): # utility function to create or share the weights + def share_or_create( + weights_to_share, + ): # utility function to create or share the weights return weights_to_share if share_weights_between_directions else create_parameter() output_dim = ( @@ -322,10 +324,14 @@ def forward( matching_vector_max, mask_2.unsqueeze(-2).unsqueeze(-1), dim=2 ) matching_vector_2_max = masked_max( - matching_vector_max.permute(0, 2, 1, 3), mask_1.unsqueeze(-2).unsqueeze(-1), dim=2 + matching_vector_max.permute(0, 2, 1, 3), + mask_1.unsqueeze(-2).unsqueeze(-1), + 
dim=2, ) matching_vector_2_mean = masked_mean( - matching_vector_max.permute(0, 2, 1, 3), mask_1.unsqueeze(-2).unsqueeze(-1), dim=2 + matching_vector_max.permute(0, 2, 1, 3), + mask_1.unsqueeze(-2).unsqueeze(-1), + dim=2, ) matching_vector_1.extend([matching_vector_1_max, matching_vector_1_mean]) diff --git a/allennlp/modules/conditional_random_field.py b/allennlp/modules/conditional_random_field.py index c40e19359b7..7efe523bfe1 100644 --- a/allennlp/modules/conditional_random_field.py +++ b/allennlp/modules/conditional_random_field.py @@ -34,7 +34,10 @@ def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tu num_labels = len(labels) start_tag = num_labels end_tag = num_labels + 1 - labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")] + labels_with_boundaries = list(labels.items()) + [ + (start_tag, "START"), + (end_tag, "END"), + ] allowed = [] for from_label_index, from_label in labels_with_boundaries: diff --git a/allennlp/modules/elmo.py b/allennlp/modules/elmo.py index 1061a8fbdc4..4aca5e1040c 100644 --- a/allennlp/modules/elmo.py +++ b/allennlp/modules/elmo.py @@ -196,9 +196,10 @@ def forward( processed_representation = representation_with_bos_eos processed_mask = mask_with_bos_eos else: - representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( - representation_with_bos_eos, mask_with_bos_eos - ) + ( + representation_without_bos_eos, + mask_without_bos_eos, + ) = remove_sentence_boundaries(representation_with_bos_eos, mask_with_bos_eos) processed_representation = representation_without_bos_eos processed_mask = mask_without_bos_eos representations.append(self._dropout(processed_representation)) @@ -336,14 +337,18 @@ def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]: # Add BOS/EOS mask = (inputs > 0).sum(dim=-1) > 0 character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids( - inputs, mask, self._beginning_of_sentence_characters, self._end_of_sentence_characters + inputs, + mask, + self._beginning_of_sentence_characters, + self._end_of_sentence_characters, ) # the character id embedding max_chars_per_token = self._options["char_cnn"]["max_characters_per_token"] # (batch_size * sequence_length, max_chars_per_token, embed_dim) character_embedding = torch.nn.functional.embedding( - character_ids_with_bos_eos.view(-1, max_chars_per_token), self._char_embedding_weights + character_ids_with_bos_eos.view(-1, max_chars_per_token), + self._char_embedding_weights, ) # run convolutions @@ -394,7 +399,8 @@ def _load_char_embedding(self): char_embed_weights = fin["char_embed"][...] 
weights = numpy.zeros( - (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]), dtype="float32" + (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]), + dtype="float32", ) weights[1:, :] = char_embed_weights @@ -410,7 +416,10 @@ def _load_cnn_weights(self): convolutions = [] for i, (width, num) in enumerate(filters): conv = torch.nn.Conv1d( - in_channels=char_embed_dim, out_channels=num, kernel_size=width, bias=True + in_channels=char_embed_dim, + out_channels=num, + kernel_size=width, + bias=True, ) # load the weights with h5py.File(cached_path(self._weight_file), "r") as fin: @@ -583,7 +592,10 @@ def forward( embedded_inputs = self._word_embedding(word_inputs) # type: ignore # shape (batch_size, timesteps + 2, embedding_dim) type_representation, mask = add_sentence_boundary_token_ids( - embedded_inputs, mask_without_bos_eos, self._bos_embedding, self._eos_embedding + embedded_inputs, + mask_without_bos_eos, + self._bos_embedding, + self._eos_embedding, ) except (RuntimeError, IndexError): # Back off to running the character convolutions, diff --git a/allennlp/modules/elmo_lstm.py b/allennlp/modules/elmo_lstm.py index ca89a3fa571..9147b23a8dd 100644 --- a/allennlp/modules/elmo_lstm.py +++ b/allennlp/modules/elmo_lstm.py @@ -126,11 +126,18 @@ def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: where the num_layers dimension represents the LSTM output from that layer. """ batch_size, total_sequence_length = mask.size() - stacked_sequence_output, final_states, restoration_indices = self.sort_and_run_forward( - self._lstm_forward, inputs, mask - ) - - num_layers, num_valid, returned_timesteps, encoder_dim = stacked_sequence_output.size() + ( + stacked_sequence_output, + final_states, + restoration_indices, + ) = self.sort_and_run_forward(self._lstm_forward, inputs, mask) + + ( + num_layers, + num_valid, + returned_timesteps, + encoder_dim, + ) = stacked_sequence_output.size() # Add back invalid rows which were removed in the call to sort_and_run_forward. 
if num_valid < batch_size: zeros = stacked_sequence_output.new_zeros( diff --git a/allennlp/modules/matrix_attention/__init__.py b/allennlp/modules/matrix_attention/__init__.py index 4807383db9d..52fdf323d4c 100644 --- a/allennlp/modules/matrix_attention/__init__.py +++ b/allennlp/modules/matrix_attention/__init__.py @@ -1,5 +1,13 @@ from allennlp.modules.matrix_attention.matrix_attention import MatrixAttention -from allennlp.modules.matrix_attention.bilinear_matrix_attention import BilinearMatrixAttention -from allennlp.modules.matrix_attention.cosine_matrix_attention import CosineMatrixAttention -from allennlp.modules.matrix_attention.dot_product_matrix_attention import DotProductMatrixAttention -from allennlp.modules.matrix_attention.linear_matrix_attention import LinearMatrixAttention +from allennlp.modules.matrix_attention.bilinear_matrix_attention import ( + BilinearMatrixAttention, +) +from allennlp.modules.matrix_attention.cosine_matrix_attention import ( + CosineMatrixAttention, +) +from allennlp.modules.matrix_attention.dot_product_matrix_attention import ( + DotProductMatrixAttention, +) +from allennlp.modules.matrix_attention.linear_matrix_attention import ( + LinearMatrixAttention, +) diff --git a/allennlp/modules/matrix_attention/linear_matrix_attention.py b/allennlp/modules/matrix_attention/linear_matrix_attention.py index 1184b848198..a321a0154b1 100644 --- a/allennlp/modules/matrix_attention/linear_matrix_attention.py +++ b/allennlp/modules/matrix_attention/linear_matrix_attention.py @@ -70,6 +70,8 @@ def reset_parameters(self): @overrides def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor: combined_tensors = util.combine_tensors_and_multiply( - self._combination, [matrix_1.unsqueeze(2), matrix_2.unsqueeze(1)], self._weight_vector + self._combination, + [matrix_1.unsqueeze(2), matrix_2.unsqueeze(1)], + self._weight_vector, ) return self._activation(combined_tensors + self._bias) diff --git a/allennlp/modules/sampled_softmax_loss.py b/allennlp/modules/sampled_softmax_loss.py index 88e82975c25..6942db08fe3 100644 --- a/allennlp/modules/sampled_softmax_loss.py +++ b/allennlp/modules/sampled_softmax_loss.py @@ -163,7 +163,10 @@ def forward( return self._forward_train(embeddings, targets, target_token_embedding) def _forward_train( - self, embeddings: torch.Tensor, targets: torch.Tensor, target_token_embedding: torch.Tensor + self, + embeddings: torch.Tensor, + targets: torch.Tensor, + target_token_embedding: torch.Tensor, ) -> torch.Tensor: # (target_token_embedding is only used in the tie_embeddings case, diff --git a/allennlp/modules/scalar_mix.py b/allennlp/modules/scalar_mix.py index 4c003a8837a..8b5d069aa3d 100644 --- a/allennlp/modules/scalar_mix.py +++ b/allennlp/modules/scalar_mix.py @@ -38,7 +38,8 @@ def __init__( self.scalar_parameters = ParameterList( [ Parameter( - torch.FloatTensor([initial_scalar_parameters[i]]), requires_grad=trainable + torch.FloatTensor([initial_scalar_parameters[i]]), + requires_grad=trainable, ) for i in range(mixture_size) ] diff --git a/allennlp/modules/seq2seq_encoders/__init__.py b/allennlp/modules/seq2seq_encoders/__init__.py index 3cacdc80f9a..207effc1041 100644 --- a/allennlp/modules/seq2seq_encoders/__init__.py +++ b/allennlp/modules/seq2seq_encoders/__init__.py @@ -33,4 +33,6 @@ StackedBidirectionalLstmSeq2SeqEncoder, ) from allennlp.modules.seq2seq_encoders.seq2seq_encoder import Seq2SeqEncoder -from allennlp.modules.seq2seq_encoders.pytorch_transformer_wrapper import PytorchTransformer +from 
allennlp.modules.seq2seq_encoders.pytorch_transformer_wrapper import ( + PytorchTransformer, +) diff --git a/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py b/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py index 398255a2aa5..b648ef6d011 100644 --- a/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py +++ b/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py @@ -28,7 +28,12 @@ def __init__( if len(layer) == 2: # no dilation conv = torch.nn.Conv1d( - last_dim, layer[1] * 2, layer[0], stride=1, padding=layer[0] - 1, bias=True + last_dim, + layer[1] * 2, + layer[0], + stride=1, + padding=layer[0] - 1, + bias=True, ) elif len(layer) == 3: # a dilation diff --git a/allennlp/modules/seq2seq_encoders/pytorch_seq2seq_wrapper.py b/allennlp/modules/seq2seq_encoders/pytorch_seq2seq_wrapper.py index 3f36be0dedd..56ef1e543a6 100644 --- a/allennlp/modules/seq2seq_encoders/pytorch_seq2seq_wrapper.py +++ b/allennlp/modules/seq2seq_encoders/pytorch_seq2seq_wrapper.py @@ -69,7 +69,10 @@ def is_bidirectional(self) -> bool: @overrides def forward( - self, inputs: torch.Tensor, mask: torch.BoolTensor, hidden_state: torch.Tensor = None + self, + inputs: torch.Tensor, + mask: torch.BoolTensor, + hidden_state: torch.Tensor = None, ) -> torch.Tensor: if self.stateful and mask is None: @@ -82,9 +85,11 @@ def forward( batch_size, total_sequence_length = mask.size() - packed_sequence_output, final_states, restoration_indices = self.sort_and_run_forward( - self._module, inputs, mask, hidden_state - ) + ( + packed_sequence_output, + final_states, + restoration_indices, + ) = self.sort_and_run_forward(self._module, inputs, mask, hidden_state) unpacked_sequence_tensor, _ = pad_packed_sequence(packed_sequence_output, batch_first=True) @@ -116,7 +121,9 @@ def forward( sequence_length_difference = total_sequence_length - unpacked_sequence_tensor.size(1) if sequence_length_difference > 0: zeros = unpacked_sequence_tensor.new_zeros( - batch_size, sequence_length_difference, unpacked_sequence_tensor.size(-1) + batch_size, + sequence_length_difference, + unpacked_sequence_tensor.size(-1), ) unpacked_sequence_tensor = torch.cat([unpacked_sequence_tensor, zeros], 1) diff --git a/allennlp/modules/seq2vec_encoders/bert_pooler.py b/allennlp/modules/seq2vec_encoders/bert_pooler.py index 7509807f49f..0401193fc8b 100644 --- a/allennlp/modules/seq2vec_encoders/bert_pooler.py +++ b/allennlp/modules/seq2vec_encoders/bert_pooler.py @@ -77,7 +77,10 @@ def get_output_dim(self) -> int: return self._embedding_dim def forward( - self, tokens: torch.Tensor, mask: torch.BoolTensor = None, num_wrapping_dims: int = 0 + self, + tokens: torch.Tensor, + mask: torch.BoolTensor = None, + num_wrapping_dims: int = 0, ): pooler = self.pooler for _ in range(num_wrapping_dims): diff --git a/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py b/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py index 95d96912aa3..b53b3d9419a 100644 --- a/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py +++ b/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py @@ -67,7 +67,10 @@ def __init__( self._convolutions: List[torch.nn.Module] = [] for i, (width, num) in enumerate(filters): conv = torch.nn.Conv1d( - in_channels=embedding_dim, out_channels=num, kernel_size=width, bias=True + in_channels=embedding_dim, + out_channels=num, + kernel_size=width, + bias=True, ) conv.weight.data.uniform_(-0.05, 0.05) conv.bias.data.fill_(0.0) diff --git a/allennlp/modules/seq2vec_encoders/pytorch_seq2vec_wrapper.py 
b/allennlp/modules/seq2vec_encoders/pytorch_seq2vec_wrapper.py index a52445fa8c1..de9e8cc2ee0 100644 --- a/allennlp/modules/seq2vec_encoders/pytorch_seq2vec_wrapper.py +++ b/allennlp/modules/seq2vec_encoders/pytorch_seq2vec_wrapper.py @@ -61,7 +61,10 @@ def get_output_dim(self) -> int: return self._module.hidden_size * (2 if is_bidirectional else 1) def forward( - self, inputs: torch.Tensor, mask: torch.BoolTensor, hidden_state: torch.Tensor = None + self, + inputs: torch.Tensor, + mask: torch.BoolTensor, + hidden_state: torch.Tensor = None, ) -> torch.Tensor: if mask is None: diff --git a/allennlp/modules/span_extractors/__init__.py b/allennlp/modules/span_extractors/__init__.py index 7a29d5aebdf..ba421ced7f8 100644 --- a/allennlp/modules/span_extractors/__init__.py +++ b/allennlp/modules/span_extractors/__init__.py @@ -1,5 +1,7 @@ from allennlp.modules.span_extractors.span_extractor import SpanExtractor -from allennlp.modules.span_extractors.endpoint_span_extractor import EndpointSpanExtractor +from allennlp.modules.span_extractors.endpoint_span_extractor import ( + EndpointSpanExtractor, +) from allennlp.modules.span_extractors.self_attentive_span_extractor import ( SelfAttentiveSpanExtractor, ) diff --git a/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py b/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py index e63a19b53aa..7545f03a7b0 100644 --- a/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py +++ b/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py @@ -96,7 +96,8 @@ def __init__( self._span_width_embedding: Optional[Embedding] = None if num_width_embeddings is not None and span_width_embedding_dim is not None: self._span_width_embedding = Embedding( - num_embeddings=num_width_embeddings, embedding_dim=span_width_embedding_dim + num_embeddings=num_width_embeddings, + embedding_dim=span_width_embedding_dim, ) elif num_width_embeddings is not None or span_width_embedding_dim is not None: raise ConfigurationError( @@ -229,11 +230,13 @@ def forward( # respective combinations and concatenate these representations. 
# Shape (batch_size, num_spans, forward_combination_dim) forward_spans = util.combine_tensors( - self._forward_combination, [forward_start_embeddings, forward_end_embeddings] + self._forward_combination, + [forward_start_embeddings, forward_end_embeddings], ) # Shape (batch_size, num_spans, backward_combination_dim) backward_spans = util.combine_tensors( - self._backward_combination, [backward_start_embeddings, backward_end_embeddings] + self._backward_combination, + [backward_start_embeddings, backward_end_embeddings], ) # Shape (batch_size, num_spans, forward_combination_dim + backward_combination_dim) span_embeddings = torch.cat([forward_spans, backward_spans], -1) diff --git a/allennlp/modules/span_extractors/endpoint_span_extractor.py b/allennlp/modules/span_extractors/endpoint_span_extractor.py index 86b19cb4a7e..f327566fade 100644 --- a/allennlp/modules/span_extractors/endpoint_span_extractor.py +++ b/allennlp/modules/span_extractors/endpoint_span_extractor.py @@ -74,7 +74,8 @@ def __init__( self._span_width_embedding: Optional[Embedding] = None if num_width_embeddings is not None and span_width_embedding_dim is not None: self._span_width_embedding = Embedding( - num_embeddings=num_width_embeddings, embedding_dim=span_width_embedding_dim + num_embeddings=num_width_embeddings, + embedding_dim=span_width_embedding_dim, ) elif num_width_embeddings is not None or span_width_embedding_dim is not None: raise ConfigurationError( diff --git a/allennlp/modules/text_field_embedders/__init__.py b/allennlp/modules/text_field_embedders/__init__.py index 9feb1eee972..75a8f982572 100644 --- a/allennlp/modules/text_field_embedders/__init__.py +++ b/allennlp/modules/text_field_embedders/__init__.py @@ -4,4 +4,6 @@ """ from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder -from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder +from allennlp.modules.text_field_embedders.basic_text_field_embedder import ( + BasicTextFieldEmbedder, +) diff --git a/allennlp/modules/token_embedders/__init__.py b/allennlp/modules/token_embedders/__init__.py index 8d1492ac362..d317930c1d2 100644 --- a/allennlp/modules/token_embedders/__init__.py +++ b/allennlp/modules/token_embedders/__init__.py @@ -5,13 +5,17 @@ from allennlp.modules.token_embedders.token_embedder import TokenEmbedder from allennlp.modules.token_embedders.embedding import Embedding -from allennlp.modules.token_embedders.token_characters_encoder import TokenCharactersEncoder +from allennlp.modules.token_embedders.token_characters_encoder import ( + TokenCharactersEncoder, +) from allennlp.modules.token_embedders.elmo_token_embedder import ElmoTokenEmbedder from allennlp.modules.token_embedders.empty_embedder import EmptyEmbedder from allennlp.modules.token_embedders.bag_of_word_counts_token_embedder import ( BagOfWordCountsTokenEmbedder, ) -from allennlp.modules.token_embedders.pass_through_token_embedder import PassThroughTokenEmbedder +from allennlp.modules.token_embedders.pass_through_token_embedder import ( + PassThroughTokenEmbedder, +) from allennlp.modules.token_embedders.pretrained_transformer_embedder import ( PretrainedTransformerEmbedder, ) diff --git a/allennlp/modules/token_embedders/embedding.py b/allennlp/modules/token_embedders/embedding.py index a68bdf8c4a1..a0796de710d 100644 --- a/allennlp/modules/token_embedders/embedding.py +++ b/allennlp/modules/token_embedders/embedding.py @@ -14,7 +14,11 @@ from allennlp.common import Tqdm from allennlp.common.checks 
import ConfigurationError -from allennlp.common.file_utils import cached_path, get_file_extension, is_url_or_existing_file +from allennlp.common.file_utils import ( + cached_path, + get_file_extension, + is_url_or_existing_file, +) from allennlp.data.vocabulary import Vocabulary from allennlp.modules.time_distributed import TimeDistributed from allennlp.modules.token_embedders.token_embedder import TokenEmbedder @@ -308,7 +312,10 @@ def extend_vocab( # It's easiest to just reload the embeddings for the entire vocab, # then only keep the ones we need. whole_weight = _read_pretrained_embeddings_file( - extension_pretrained_file, embedding_dim, extended_vocab, vocab_namespace + extension_pretrained_file, + embedding_dim, + extended_vocab, + vocab_namespace, ) extra_weight = whole_weight[self.num_embeddings :, :] @@ -441,18 +448,24 @@ def _read_embeddings_from_text_file( num_tokens_found += 1 else: logger.debug( - "Token %s was not found in the embedding file. Initialising randomly.", token + "Token %s was not found in the embedding file. Initialising randomly.", + token, ) logger.info( - "Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size + "Pretrained embeddings were found for %d out of %d tokens", + num_tokens_found, + vocab_size, ) return embedding_matrix def _read_embeddings_from_hdf5( - embeddings_filename: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens" + embeddings_filename: str, + embedding_dim: int, + vocab: Vocabulary, + namespace: str = "tokens", ) -> torch.FloatTensor: """ Reads from a hdf5 formatted file. The embedding matrix is assumed to diff --git a/allennlp/modules/token_embedders/pretrained_transformer_embedder.py b/allennlp/modules/token_embedders/pretrained_transformer_embedder.py index 9903c310bd8..a102be910ee 100644 --- a/allennlp/modules/token_embedders/pretrained_transformer_embedder.py +++ b/allennlp/modules/token_embedders/pretrained_transformer_embedder.py @@ -194,7 +194,10 @@ def forward( # We call this with kwargs because some of the huggingface models don't have the # token_type_ids parameter and fail even when it's given as None. # Also, as of transformers v2.5.1, they are taking FloatTensor masks. 
- parameters = {"input_ids": token_ids, "attention_mask": transformer_mask.float()} + parameters = { + "input_ids": token_ids, + "attention_mask": transformer_mask.float(), + } if type_ids is not None: parameters["token_type_ids"] = type_ids @@ -214,7 +217,10 @@ def forward( if fold_long_sequences: embeddings = self._unfold_long_sequences( - embeddings, segment_concat_mask, batch_size, num_segment_concat_wordpieces + embeddings, + segment_concat_mask, + batch_size, + num_segment_concat_wordpieces, ) return embeddings @@ -264,7 +270,11 @@ def fold(tensor): # Shape: [batch_size, num_segment_concat_wordpieces] # Shape: [batch_size * num_segments, self._max_length] return tensor.reshape(-1, self._max_length) - return fold(token_ids), fold(mask), fold(type_ids) if type_ids is not None else None + return ( + fold(token_ids), + fold(mask), + fold(type_ids) if type_ids is not None else None, + ) def _unfold_long_sequences( self, @@ -338,7 +348,10 @@ def lengths_to_mask(lengths, max_len, device): embeddings = embeddings.reshape(batch_size, num_segments, self._max_length, embedding_size) embeddings = embeddings[ - :, :, self._num_added_start_tokens : embeddings.size(2) - self._num_added_end_tokens, : + :, + :, + self._num_added_start_tokens : embeddings.size(2) - self._num_added_end_tokens, + :, ] # truncate segment-level start/end tokens embeddings = embeddings.reshape(batch_size, -1, embedding_size) # flatten @@ -358,7 +371,9 @@ def lengths_to_mask(lengths, max_len, device): embeddings = torch.cat([embeddings, torch.zeros_like(end_token_embeddings)], 1) # Add end token embeddings back embeddings.scatter_( - 1, end_token_indices.unsqueeze(-1).expand_as(end_token_embeddings), end_token_embeddings + 1, + end_token_indices.unsqueeze(-1).expand_as(end_token_embeddings), + end_token_embeddings, ) # Now put back start tokens. We can do this before putting back end tokens, but then diff --git a/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py b/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py index 982ca1b1f46..e31f7f0fee4 100644 --- a/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py +++ b/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py @@ -3,7 +3,10 @@ from overrides import overrides import torch -from allennlp.modules.token_embedders import PretrainedTransformerEmbedder, TokenEmbedder +from allennlp.modules.token_embedders import ( + PretrainedTransformerEmbedder, + TokenEmbedder, +) from allennlp.nn import util @@ -105,7 +108,10 @@ def forward( """ # Shape: [batch_size, num_wordpieces, embedding_size]. 
embeddings = self._matched_embedder( - token_ids, wordpiece_mask, type_ids=type_ids, segment_concat_mask=segment_concat_mask + token_ids, + wordpiece_mask, + type_ids=type_ids, + segment_concat_mask=segment_concat_mask, ) # span_embeddings: (batch_size, num_orig_tokens, max_span_length, embedding_size) diff --git a/allennlp/modules/transformer/__init__.py b/allennlp/modules/transformer/__init__.py index f346ace8360..9650a999469 100644 --- a/allennlp/modules/transformer/__init__.py +++ b/allennlp/modules/transformer/__init__.py @@ -123,7 +123,9 @@ def forward(self, token_ids: torch.LongTensor, mask: torch.BoolTensor): ``` """ -from allennlp.modules.transformer.positional_encoding import SinusoidalPositionalEncoding +from allennlp.modules.transformer.positional_encoding import ( + SinusoidalPositionalEncoding, +) from allennlp.modules.transformer.transformer_module import TransformerModule from allennlp.modules.transformer.transformer_embeddings import ( @@ -133,7 +135,10 @@ def forward(self, token_ids: torch.LongTensor, mask: torch.BoolTensor): ) from allennlp.modules.transformer.self_attention import SelfAttention from allennlp.modules.transformer.activation_layer import ActivationLayer -from allennlp.modules.transformer.transformer_layer import AttentionLayer, TransformerLayer +from allennlp.modules.transformer.transformer_layer import ( + AttentionLayer, + TransformerLayer, +) from allennlp.modules.transformer.transformer_stack import TransformerStack from allennlp.modules.transformer.transformer_pooler import TransformerPooler from allennlp.modules.transformer.output_layer import OutputLayer diff --git a/allennlp/modules/transformer/bimodal_connection_layer.py b/allennlp/modules/transformer/bimodal_connection_layer.py index 5d7e4f7fc88..26834b97658 100644 --- a/allennlp/modules/transformer/bimodal_connection_layer.py +++ b/allennlp/modules/transformer/bimodal_connection_layer.py @@ -31,7 +31,10 @@ def forward(self, hidden_states1, input_tensor1, hidden_states2, input_tensor2): class BiModalConnectionLayer(TransformerModule, FromParams): - _huggingface_mapping = {"biAttention": "bimodal_attention", "biOutput": "bimodal_output"} + _huggingface_mapping = { + "biAttention": "bimodal_attention", + "biOutput": "bimodal_output", + } def __init__( self, diff --git a/allennlp/modules/transformer/self_attention.py b/allennlp/modules/transformer/self_attention.py index 6db6aba1fad..7ff7385759c 100644 --- a/allennlp/modules/transformer/self_attention.py +++ b/allennlp/modules/transformer/self_attention.py @@ -134,7 +134,10 @@ def forward( @classmethod def _get_mapping( - cls, pretrained_module=None, source="huggingface", mapping: Optional[Dict[str, str]] = None + cls, + pretrained_module=None, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, ): combined_mapping = {} if "huggingface" in source: diff --git a/allennlp/modules/transformer/transformer_embeddings.py b/allennlp/modules/transformer/transformer_embeddings.py index df0e53c4544..6f9053c8388 100644 --- a/allennlp/modules/transformer/transformer_embeddings.py +++ b/allennlp/modules/transformer/transformer_embeddings.py @@ -71,7 +71,10 @@ def __init__(self, feature_size: int, embedding_size: int, dropout: float = 0.0) image_embeddings = torch.nn.Linear(feature_size, embedding_size) location_embeddings = torch.nn.Linear(4, embedding_size) embeddings = torch.nn.ModuleDict( - {"image_embeddings": image_embeddings, "location_embeddings": location_embeddings} + { + "image_embeddings": image_embeddings, + "location_embeddings": 
location_embeddings, + } ) super().__init__(embeddings, embedding_size, dropout) diff --git a/allennlp/modules/transformer/transformer_layer.py b/allennlp/modules/transformer/transformer_layer.py index 3282b2dbf14..70ec1ff56c1 100644 --- a/allennlp/modules/transformer/transformer_layer.py +++ b/allennlp/modules/transformer/transformer_layer.py @@ -158,10 +158,14 @@ def __init__( ) self.intermediate = ActivationLayer( - hidden_size=hidden_size, intermediate_size=intermediate_size, activation=activation + hidden_size=hidden_size, + intermediate_size=intermediate_size, + activation=activation, ) self.output = OutputLayer( - input_size=intermediate_size, hidden_size=hidden_size, dropout=hidden_dropout + input_size=intermediate_size, + hidden_size=hidden_size, + dropout=hidden_dropout, ) def forward( diff --git a/allennlp/modules/transformer/transformer_module.py b/allennlp/modules/transformer/transformer_module.py index 11b650d84ec..c2c1931ce19 100644 --- a/allennlp/modules/transformer/transformer_module.py +++ b/allennlp/modules/transformer/transformer_module.py @@ -48,7 +48,10 @@ def _get_mapping( @classmethod def _get_mapped_submodules( - cls, pretrained_module, source="huggingface", mapping: Optional[Dict[str, str]] = None + cls, + pretrained_module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, ): """ Subclasses overload this method, and provide appropriate name mapping based on the source. diff --git a/allennlp/modules/transformer/transformer_stack.py b/allennlp/modules/transformer/transformer_stack.py index edeefc27ba9..b475cce7358 100644 --- a/allennlp/modules/transformer/transformer_stack.py +++ b/allennlp/modules/transformer/transformer_stack.py @@ -124,7 +124,12 @@ def forward( return tuple( v - for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] + for v in [ + hidden_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] if v is not None ) diff --git a/allennlp/modules/transformer/util.py b/allennlp/modules/transformer/util.py index 33dfcf77859..80732dc21cf 100644 --- a/allennlp/modules/transformer/util.py +++ b/allennlp/modules/transformer/util.py @@ -3,7 +3,8 @@ def apply_mask( - values: torch.FloatTensor, mask: Union[torch.BoolTensor, torch.IntTensor, torch.FloatTensor] + values: torch.FloatTensor, + mask: Union[torch.BoolTensor, torch.IntTensor, torch.FloatTensor], ) -> torch.FloatTensor: """ # Parameters diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py index fff07b7dac2..dcc795550fc 100644 --- a/allennlp/nn/beam_search.py +++ b/allennlp/nn/beam_search.py @@ -68,7 +68,10 @@ class Sampler(Registrable): default_implementation = "deterministic" def init_state( - self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int + self, + start_class_log_probabilities: torch.Tensor, + batch_size: int, + num_classes: int, ) -> StateType: return {} @@ -192,7 +195,9 @@ def sample_nodes( # NOTE: These indices are not indices into `log_probs`, they are indices into `top_k_log_probs`. # shape: (batch_size, per_node_beam_size) sampled_indices = torch.multinomial( - normalized_top_k_probs, per_node_beam_size, replacement=self.with_replacement + normalized_top_k_probs, + per_node_beam_size, + replacement=self.with_replacement, ) # Convert `sampled_indices` back to indices in the original `log_probs` tensor. @@ -279,7 +284,9 @@ def sample_nodes( # NOTE: These indices are not indices into `log_probs`, they are indices into `log_probs_descending`. 
# shape: (batch_size, per_node_beam_size) sampled_indices = torch.multinomial( - filtered_probabilities, per_node_beam_size, replacement=self.with_replacement + filtered_probabilities, + per_node_beam_size, + replacement=self.with_replacement, ) # Convert `sampled_indices` back to indices in the original `log_probs` tensor. @@ -311,7 +318,10 @@ def __init__(self, temperature: float = 1.0): @overrides def init_state( - self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int + self, + start_class_log_probabilities: torch.Tensor, + batch_size: int, + num_classes: int, ) -> StateType: # shape: (batch_size, num_classes) zeros = start_class_log_probabilities.new_zeros((batch_size, num_classes)) @@ -400,7 +410,11 @@ def sample_beams( # shape: (batch_size * beam_size,) phi_S = selected_log_probs.reshape(batch_size * beam_size) - return selected_log_probs, selected_indices, {"G_phi_S": G_phi_S_new, "phi_S": phi_S} + return ( + selected_log_probs, + selected_indices, + {"G_phi_S": G_phi_S_new, "phi_S": phi_S}, + ) def gumbel(self, phi) -> torch.Tensor: """ @@ -579,7 +593,9 @@ def search( old_step = cast(StepFunctionTypeNoTimestep, step) def new_step( - last_predictions: torch.Tensor, state: Dict[str, torch.Tensor], time_step: int + last_predictions: torch.Tensor, + state: Dict[str, torch.Tensor], + time_step: int, ): return old_step(last_predictions, state) @@ -692,7 +708,7 @@ def _search( ) # shape (both): (batch_size * beam_size, per_node_beam_size) - top_log_probabilities, predicted_classes, sampler_state = self.sampler.sample_nodes( + (top_log_probabilities, predicted_classes, sampler_state,) = self.sampler.sample_nodes( cleaned_log_probabilities, self.per_node_beam_size, sampler_state ) diff --git a/allennlp/nn/chu_liu_edmonds.py b/allennlp/nn/chu_liu_edmonds.py index 74d9726ddcc..fa15d9e9265 100644 --- a/allennlp/nn/chu_liu_edmonds.py +++ b/allennlp/nn/chu_liu_edmonds.py @@ -69,7 +69,13 @@ def decode_mst( # The main algorithm operates inplace. chu_liu_edmonds( - length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives + length, + score_matrix, + current_nodes, + final_edges, + old_input, + old_output, + representatives, ) heads = numpy.zeros([max_length], numpy.int32) @@ -224,7 +230,13 @@ def chu_liu_edmonds( representatives[cycle_representative].add(node) chu_liu_edmonds( - length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives + length, + score_matrix, + current_nodes, + final_edges, + old_input, + old_output, + representatives, ) # Expansion stage. 
diff --git a/allennlp/nn/initializers.py b/allennlp/nn/initializers.py index 7d12dd9ba8a..c50aac28370 100644 --- a/allennlp/nn/initializers.py +++ b/allennlp/nn/initializers.py @@ -245,7 +245,10 @@ class KaimingUniformInitializer(_InitializerWrapper): def __init__(self, a: float = 0.0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"): super().__init__( - init_function=torch.nn.init.kaiming_uniform_, a=a, mode=mode, nonlinearity=nonlinearity + init_function=torch.nn.init.kaiming_uniform_, + a=a, + mode=mode, + nonlinearity=nonlinearity, ) @@ -257,7 +260,10 @@ class KaimingNormalInitializer(_InitializerWrapper): def __init__(self, a: float = 0.0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"): super().__init__( - init_function=torch.nn.init.kaiming_normal_, a=a, mode=mode, nonlinearity=nonlinearity + init_function=torch.nn.init.kaiming_normal_, + a=a, + mode=mode, + nonlinearity=nonlinearity, ) @@ -463,7 +469,9 @@ class InitializerApplicator(FromParams): """ def __init__( - self, regexes: List[Tuple[str, Initializer]] = None, prevent_regexes: List[str] = None + self, + regexes: List[Tuple[str, Initializer]] = None, + prevent_regexes: List[str] = None, ) -> None: self._initializers = regexes or [] self._prevent_regex = None diff --git a/allennlp/nn/util.py b/allennlp/nn/util.py index 9cc1c313156..31cf3335b25 100644 --- a/allennlp/nn/util.py +++ b/allennlp/nn/util.py @@ -172,7 +172,12 @@ def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor): # sequence lengths and returning the now sorted indices. _, reverse_mapping = permutation_index.sort(0, descending=False) restoration_indices = index_range.index_select(0, reverse_mapping) - return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index + return ( + sorted_tensor, + sorted_sequence_lengths, + restoration_indices, + permutation_index, + ) def get_final_encoder_states( @@ -784,7 +789,9 @@ def sequence_cross_entropy_with_logits( # shape : (2,) alpha_factor = torch.tensor( - [1.0 - float(alpha), float(alpha)], dtype=weights.dtype, device=weights.device + [1.0 - float(alpha), float(alpha)], + dtype=weights.dtype, + device=weights.device, ) elif isinstance(alpha, (list, numpy.ndarray, torch.Tensor)): @@ -1266,7 +1273,10 @@ def batched_index_select( def masked_index_fill( - target: torch.Tensor, indices: torch.LongTensor, mask: torch.BoolTensor, fill_value: int = 1 + target: torch.Tensor, + indices: torch.LongTensor, + mask: torch.BoolTensor, + fill_value: int = 1, ) -> torch.Tensor: """ The given `indices` in `target` will be will be filled with `fill_value` given a `mask`. @@ -1508,7 +1518,10 @@ def bucket_values( def add_sentence_boundary_token_ids( - tensor: torch.Tensor, mask: torch.BoolTensor, sentence_begin_token: Any, sentence_end_token: Any + tensor: torch.Tensor, + mask: torch.BoolTensor, + sentence_begin_token: Any, + sentence_end_token: Any, ) -> Tuple[torch.Tensor, torch.BoolTensor]: """ Add begin/end of sentence tokens to the batch of sentences. @@ -1742,7 +1755,9 @@ def find_text_field_embedder(model: torch.nn.Module) -> torch.nn.Module: first one, as it's very rare to have more than one. If there isn't a `TextFieldEmbedder` in the given `Model`, we raise a `ValueError`. 
""" - from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder + from allennlp.modules.text_field_embedders.text_field_embedder import ( + TextFieldEmbedder, + ) for module in model.modules(): if isinstance(module, TextFieldEmbedder): @@ -1764,7 +1779,9 @@ def find_embedding_layer(model: torch.nn.Module) -> torch.nn.Module: from transformers.models.bert.modeling_bert import BertEmbeddings from transformers.models.albert.modeling_albert import AlbertEmbeddings from transformers.models.roberta.modeling_roberta import RobertaEmbeddings - from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder + from allennlp.modules.text_field_embedders.text_field_embedder import ( + TextFieldEmbedder, + ) from allennlp.modules.text_field_embedders.basic_text_field_embedder import ( BasicTextFieldEmbedder, ) diff --git a/allennlp/predictors/sentence_tagger.py b/allennlp/predictors/sentence_tagger.py index 04d19e92916..e795e8147a8 100644 --- a/allennlp/predictors/sentence_tagger.py +++ b/allennlp/predictors/sentence_tagger.py @@ -23,7 +23,10 @@ class SentenceTaggerPredictor(Predictor): """ def __init__( - self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm" + self, + model: Model, + dataset_reader: DatasetReader, + language: str = "en_core_web_sm", ) -> None: super().__init__(model, dataset_reader) self._tokenizer = SpacyTokenizer(language=language, pos_tags=True) diff --git a/allennlp/tools/archive_surgery.py b/allennlp/tools/archive_surgery.py index fc1014d23fc..4cad447b0ae 100644 --- a/allennlp/tools/archive_surgery.py +++ b/allennlp/tools/archive_surgery.py @@ -46,7 +46,10 @@ def main(): help="overwrite the input file with the modified configuration", ) parser.add_argument( - "-f", "--force", action="store_true", help="overwrite the output file if it exists" + "-f", + "--force", + action="store_true", + help="overwrite the output file if it exists", ) args = parser.parse_args() diff --git a/allennlp/tools/create_elmo_embeddings_from_vocab.py b/allennlp/tools/create_elmo_embeddings_from_vocab.py index a0c3c6f4c77..47e3c372da7 100644 --- a/allennlp/tools/create_elmo_embeddings_from_vocab.py +++ b/allennlp/tools/create_elmo_embeddings_from_vocab.py @@ -108,13 +108,19 @@ def main( help="A path to a vocabulary file to generate representations for.", ) parser.add_argument( - "--elmo_config", type=str, help="The path to a directory containing an ELMo config file." + "--elmo_config", + type=str, + help="The path to a directory containing an ELMo config file.", ) parser.add_argument( - "--elmo_weights", type=str, help="The path to a directory containing an ELMo weight file." + "--elmo_weights", + type=str, + help="The path to a directory containing an ELMo weight file.", ) parser.add_argument( - "--output_dir", type=str, help="The output directory to store the serialised embeddings." 
+ "--output_dir", + type=str, + help="The output directory to store the serialised embeddings.", ) parser.add_argument("--batch_size", type=int, default=64, help="The batch size to use.") parser.add_argument("--device", type=int, default=-1, help="The device to run on.") diff --git a/allennlp/training/__init__.py b/allennlp/training/__init__.py index 2489a2c6cff..b2d2a3d376c 100644 --- a/allennlp/training/__init__.py +++ b/allennlp/training/__init__.py @@ -8,4 +8,3 @@ TrackEpochCallback, TensorBoardCallback, ) -from allennlp.training.deepspeed import DeepspeedTrainer # TODO: make this optional \ No newline at end of file diff --git a/allennlp/training/checkpointer.py b/allennlp/training/checkpointer.py index 2833f1a6721..f98eff9ee78 100644 --- a/allennlp/training/checkpointer.py +++ b/allennlp/training/checkpointer.py @@ -63,7 +63,10 @@ def __init__( self._last_save_time = time.time() def maybe_save_checkpoint( - self, trainer: "allennlp.training.trainer.Trainer", epoch: int, batches_this_epoch: int + self, + trainer: "allennlp.training.trainer.Trainer", + epoch: int, + batches_this_epoch: int, ) -> None: """ Given amount of time lapsed between the last save and now (tracked internally), the @@ -76,7 +79,7 @@ def maybe_save_checkpoint( only looks at time, not batch or epoch number, though those parameters are available to you if you want to customize the behavior of this function. """ - if not trainer._master: + if not trainer._primary: return if self._model_save_interval is None: return @@ -94,8 +97,9 @@ def save_checkpoint( is_best_so_far: bool = False, save_model_only=False, ) -> None: - if not trainer._master: + if not trainer._primary: return + if self._serialization_dir is not None: with trainer.get_checkpoint_state() as state: model_state, training_states = state diff --git a/allennlp/training/deepspeed/__init__.py b/allennlp/training/deepspeed/__init__.py index 5c709ed7356..e5f1e5c47e8 100644 --- a/allennlp/training/deepspeed/__init__.py +++ b/allennlp/training/deepspeed/__init__.py @@ -1,8 +1,2 @@ from allennlp.training.deepspeed.trainer import DeepspeedTrainer - -# from allennlp.training.deepspeed.optimizers import ( -# FusedAdamOptimizer, -# DeepspeedCPUAdamOptimizer, -# FusedLambOptimizer -# ) from allennlp.training.deepspeed import optimizers diff --git a/allennlp/training/deepspeed/checkpointer.py b/allennlp/training/deepspeed/checkpointer.py index b21c675362d..18adcd12007 100644 --- a/allennlp/training/deepspeed/checkpointer.py +++ b/allennlp/training/deepspeed/checkpointer.py @@ -3,15 +3,11 @@ import logging import os import shutil -import time from overrides import overrides from pathlib import Path -import torch - import allennlp -from allennlp.nn import util as nn_util from allennlp.training import Checkpointer logger = logging.getLogger(__name__) @@ -28,14 +24,12 @@ def save_checkpoint( ) -> None: if self._serialization_dir is None: return - + super().save_checkpoint(epoch, trainer, is_best_so_far, save_model_only) checkpoint_id = "deepspeed_epoch_{}".format(epoch) - model_path = os.path.join(self._serialization_dir, "model_state_epoch_{}".format(epoch)) - trainer.model_engine.save_checkpoint(self._serialization_dir, checkpoint_id) - if trainer._master and is_best_so_far: + if trainer._primary and is_best_so_far: engine_dir = os.path.join(self._serialization_dir, "best_deepspeed") shutil.rmtree(engine_dir, ignore_errors=True) # in case no previous checkpoints shutil.copytree(os.path.join(self._serialization_dir, checkpoint_id), engine_dir) diff --git 
a/allennlp/training/deepspeed/optimizers.py b/allennlp/training/deepspeed/optimizers.py index 226adcb67ce..db13427235f 100644 --- a/allennlp/training/deepspeed/optimizers.py +++ b/allennlp/training/deepspeed/optimizers.py @@ -2,37 +2,10 @@ import torch -from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.ops.lamb import FusedLamb - -# from deepspeed.runtime.fp16.onebit_adam import OnebitAdam - from allennlp.training.optimizers import Optimizer, make_parameter_groups -# This does not currently work -@Optimizer.register("cpu_adam") -class DeepspeedCPUAdamOptimizer(Optimizer, DeepSpeedCPUAdam): - def __init__( - self, - model_parameters: List[Tuple[str, torch.nn.Parameter]], - parameter_groups: List[Tuple[List[str], Dict[str, Any]]] = None, - lr: float = 0.001, - betas: Tuple[float, float] = (0.9, 0.999), - eps: float = 1e-08, - weight_decay: float = 0.0, - amsgrad: bool = False, - ): - super().__init__( - model_params=make_parameter_groups(model_parameters, parameter_groups), - lr=lr, - betas=betas, - eps=eps, - weight_decay=weight_decay, - amsgrad=amsgrad, - ) - - @Optimizer.register("fused_lamb") class FusedLambOptimizer(Optimizer, FusedLamb): def __init__( diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py index f8920ff0a7a..601d057432a 100644 --- a/allennlp/training/deepspeed/trainer.py +++ b/allennlp/training/deepspeed/trainer.py @@ -1,11 +1,7 @@ -import datetime import logging -import os import re import time -import traceback -from contextlib import contextmanager -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union from overrides import overrides import torch @@ -17,21 +13,17 @@ from allennlp.common import Lazy, Tqdm from allennlp.common import util as common_util from allennlp.common.checks import ConfigurationError -from allennlp.data import DataLoader -from allennlp.data.dataloader import TensorDict +from allennlp.data import DataLoader, TensorDict from allennlp.models.model import Model from allennlp.nn import util as nn_util from allennlp.training import util as training_util from allennlp.training.checkpointer import Checkpointer from allennlp.training.moving_average import MovingAverage from allennlp.training.optimizers import Optimizer -from allennlp.training.tensorboard_writer import TensorboardWriter from allennlp.training.trainer import ( Trainer, GradientDescentTrainer, - BatchCallback, - EpochCallback, TrainerCallback, ) @@ -57,12 +49,8 @@ def __init__( serialization_dir: Optional[str] = None, checkpointer: Checkpointer = None, cuda_device: Optional[Union[int, torch.device]] = None, - tensorboard_writer: TensorboardWriter = None, moving_average: Optional[MovingAverage] = None, - batch_callbacks: List[BatchCallback] = None, - epoch_callbacks: List[EpochCallback] = None, - end_callbacks: List[EpochCallback] = None, - trainer_callbacks: List[TrainerCallback] = None, + callbacks: List[TrainerCallback] = None, distributed: bool = False, local_rank: int = 0, world_size: int = 1, @@ -78,14 +66,10 @@ def __init__( num_epochs=num_epochs, serialization_dir=serialization_dir, cuda_device=cuda_device, - tensorboard_writer=tensorboard_writer, checkpointer=checkpointer, moving_average=moving_average, - batch_callbacks=batch_callbacks, - epoch_callbacks=epoch_callbacks, - end_callbacks=end_callbacks, - trainer_callbacks=trainer_callbacks, - distributed=False, # Avoid DDP init + callbacks=callbacks, + distributed=False, # Avoid DDP init local_rank=local_rank, 
world_size=world_size, num_gradient_accumulation_steps=num_gradient_accumulation_steps, @@ -95,7 +79,6 @@ def __init__( self.model_engine = deepspeed_engine self._distributed = True - # serialization_dir = None if checkpointer is None and serialization_dir is not None: self._checkpointer = DeepspeedCheckpointer(serialization_dir) @@ -160,7 +143,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown - if self._master: + if self._primary: batch_generator_tqdm = Tqdm.tqdm(batch_generator, total=num_training_batches) else: batch_generator_tqdm = batch_generator @@ -175,8 +158,6 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total - # if not self._master: - # print(f'Rank {self._rank}: {batch_num_total}') batch_outputs = self.batch_outputs(batch, for_training=True) @@ -194,20 +175,6 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: self.model_engine.backward(loss) self.model_engine.step() - param_updates = None - if self._tensorboard.should_log_histograms_this_batch() and self._master: - # Get the magnitude of parameter updates for logging. We need to do some - # computation before and after the optimizer step, and it's expensive because of - # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so - # we don't do this every batch, only when it's requested. - param_updates = { - name: param.detach().cpu().clone() - for name, param in self.model.named_parameters() - } - - for name, param in self.model.named_parameters(): - param_updates[name].sub_(param.detach().cpu()) - # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) @@ -224,43 +191,30 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: cuda_device=self.cuda_device, ) - if self._master: + if self._primary: # Updating tqdm only for the master as the trainers wouldn't have one description = training_util.description_from_metrics(metrics) batch_generator_tqdm.set_description(description, refresh=False) - self._tensorboard.log_batch( - self.model, - self.optimizer, - 0.0, - metrics, - batch, - param_updates, - ) - # if self._checkpointer is not None: - # self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) + if self._checkpointer is not None: + self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) - for callback in self._batch_callbacks: - callback( + for callback in self._callbacks: + callback.on_batch( self, batch, - [batch_outputs], + batch_outputs, metrics, epoch, batches_this_epoch, is_training=True, - is_master=self._master, + is_primary=self._primary, + batch_grad_norm=None, # not yet implemented for DeepspeedTrainer ) - # if not self._master: - # print(f'Rank {self._rank}: {batches_this_epoch}') - if self._distributed: dist.barrier() - # if not self._master: - # print(f'Rank {self._rank}: Passed barrier') - metrics = training_util.get_metrics( self.model, train_loss, @@ -279,168 +233,6 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024) return metrics - def __try_train(self) -> Dict[str, Any]: - try: - epoch_counter = self._restore_checkpoint() - except RuntimeError: - traceback.print_exc() - raise ConfigurationError( - "Could not recover training from the checkpoint. 
Did you mean to output to " - "a different serialization directory or delete the existing serialization " - "directory?" - ) - - training_util.enable_gradient_clipping(self.model, self._grad_clipping) - - logger.info("Beginning training.") - - val_metrics: Dict[str, float] = {} - this_epoch_val_metric: float = 0.0 - metrics: Dict[str, Any] = {} - epochs_trained = 0 - training_start_time = time.time() - - metrics["best_epoch"] = self._metric_tracker.best_epoch - for key, value in self._metric_tracker.best_epoch_metrics.items(): - metrics["best_validation_" + key] = value - - for callback in self._epoch_callbacks: - callback(self, metrics={}, epoch=-1, is_master=self._master) - - for epoch in range(epoch_counter, self._num_epochs): - epoch_start_time = time.time() - logger.info("Training epoch.") - train_metrics = self._train_epoch(epoch) - - # if self._master and self._checkpointer is not None: - if self._checkpointer is not None: - self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) - - # # Wait for the master to finish saving the model checkpoint - if self._distributed: - dist.barrier() - - # logger.info("Passed start of epoch checkpoint barrier.") - - # get peak of memory usage - for key, value in train_metrics.items(): - if key.startswith("gpu_") and key.endswith("_memory_MB"): - metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) - elif key.startswith("worker_") and key.endswith("_memory_MB"): - metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) - - if self._validation_data_loader is not None: - with torch.no_grad(): - # We have a validation set, so compute all the metrics on it. - val_loss, val_reg_loss, num_batches = self._validation_loss(epoch) - - # It is safe again to wait till the validation is done. This is - # important to get the metrics right. - if self._distributed: - dist.barrier() - - # logger.info("Passed validation metrics barrier.") - - val_metrics = training_util.get_metrics( - self.model, - val_loss, - val_reg_loss, - batch_loss=None, - batch_reg_loss=None, - num_batches=num_batches, - reset=True, - world_size=self._world_size, - cuda_device=self.cuda_device, - ) - - # Check validation metric for early stopping - this_epoch_val_metric = val_metrics[self._validation_metric] - self._metric_tracker.add_metric(this_epoch_val_metric) - - if self._metric_tracker.should_stop_early(): - logger.info("Ran out of patience. Stopping training.") - break - - if self._master: - self._tensorboard.log_metrics( - train_metrics, val_metrics=val_metrics, log_to_console=True, epoch=epoch + 1 - ) # +1 because tensorboard doesn't like 0 - - # Create overall metrics dict - training_elapsed_time = time.time() - training_start_time - metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time)) - metrics["training_start_epoch"] = epoch_counter - metrics["training_epochs"] = epochs_trained - metrics["epoch"] = epoch - - for key, value in train_metrics.items(): - metrics["training_" + key] = value - for key, value in val_metrics.items(): - metrics["validation_" + key] = value - - if self._metric_tracker.is_best_so_far(): - # Update all the best_ metrics. - # (Otherwise they just stay the same as they were.) 
- metrics["best_epoch"] = epoch - for key, value in val_metrics.items(): - metrics["best_validation_" + key] = value - - self._metric_tracker.best_epoch_metrics = val_metrics - - if self._serialization_dir and self._master: - common_util.dump_metrics( - os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), - metrics, - ) - - # The Scheduler API is agnostic to whether your schedule requires a validation metric - - # if it doesn't, the validation metric passed here is ignored. - if self._learning_rate_scheduler: - self._learning_rate_scheduler.step(this_epoch_val_metric) - if self._momentum_scheduler: - self._momentum_scheduler.step(this_epoch_val_metric) - - # if self._master and self._checkpointer is not None: - if self._checkpointer is not None: - self._checkpointer.save_checkpoint( - epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() - ) - - # logger.info("Starting end of epoch checkpoint barrier...") - # Wait for the master to finish saving the checkpoint - if self._distributed: - dist.barrier() - - # logger.info("Passed end of epoch checkpoint barrier.") - - for callback in self._epoch_callbacks: - callback(self, metrics=metrics, epoch=epoch, is_master=self._master) - - epoch_elapsed_time = time.time() - epoch_start_time - logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time)) - - if epoch < self._num_epochs - 1: - training_elapsed_time = time.time() - training_start_time - estimated_time_remaining = training_elapsed_time * ( - (self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1 - ) - formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining))) - logger.info("Estimated training time remaining: %s", formatted_time) - - epochs_trained += 1 - - for callback in self._end_callbacks: - callback(self, metrics=metrics, epoch=epoch, is_master=self._master) - - # Load the best model state before returning - best_model_state = ( - None if self._checkpointer is None else self._checkpointer.best_model_state() - ) - if best_model_state: - self.model.load_state_dict(best_model_state) - - return metrics - def _restore_checkpoint(self) -> int: """ Restores the model and training state from the last saved checkpoint. 
@@ -460,7 +252,11 @@ def _restore_checkpoint(self) -> int: return 0 self._checkpointer: DeepspeedCheckpointer - checkpoint_id, model_state, training_state = self._checkpointer.restore_checkpoint() + ( + checkpoint_id, + model_state, + training_state, + ) = self._checkpointer.restore_checkpoint() if not training_state: # No checkpoint to restore, start at 0 @@ -514,13 +310,10 @@ def from_partial_objects( optimizer: Lazy[Optimizer] = Lazy(Optimizer.default), deepspeed_optimizer: Dict[str, Any] = None, deepspeed_args: Lazy[DeepspeedArgs] = Lazy(DeepspeedArgs), - tensorboard_writer: Lazy[TensorboardWriter] = Lazy(TensorboardWriter), moving_average: Lazy[MovingAverage] = None, checkpointer: Lazy[Checkpointer] = Lazy(DeepspeedCheckpointer), - batch_callbacks: List[BatchCallback] = None, - epoch_callbacks: List[EpochCallback] = None, - end_callbacks: List[EpochCallback] = None, - trainer_callbacks: List[TrainerCallback] = None, + callbacks: List[Lazy[TrainerCallback]] = None, + trainer_callbacks: List[Lazy[TrainerCallback]] = None, ) -> "DeepspeedTrainer": if no_grad: for name, parameter in model.named_parameters(): @@ -535,7 +328,6 @@ def from_partial_objects( ) checkpointer_ = checkpointer.construct(serialization_dir=serialization_dir) - tensorboard_writer_ = tensorboard_writer.construct(serialization_dir=serialization_dir) if deepspeed_config.optimizer: optim_ = None @@ -546,18 +338,28 @@ def from_partial_objects( local_rank=local_rank ) - if not hasattr(data_loader, 'batch_size'): - raise ConfigurationError("Please specify your batch size in Deepspeed config if not using AllennlpDataLoader.") + if not hasattr(data_loader, "batch_size"): + raise ConfigurationError( + "Please specify your batch size in Deepspeed config if not using AllennlpDataLoader." + ) model_engine = DeepspeedTrainer._build_engine( model, optim_, deepspeed_config, deepspeed_args_, - data_loader.batch_size, # type: ignore + data_loader.batch_size, # type: ignore num_gradient_accumulation_steps, ) + callbacks = callbacks or trainer_callbacks or [] + + callbacks_: List[TrainerCallback] = [] + + for callback in callbacks: + callback_ = callback.construct(serialization_dir=serialization_dir) + callbacks_.append(callback_) + return cls( model, data_loader, @@ -568,13 +370,9 @@ def from_partial_objects( num_epochs=num_epochs, serialization_dir=serialization_dir, cuda_device=cuda_device, - tensorboard_writer=tensorboard_writer_, checkpointer=checkpointer_, moving_average=moving_average_, - batch_callbacks=batch_callbacks, - epoch_callbacks=epoch_callbacks, - end_callbacks=end_callbacks, - trainer_callbacks=trainer_callbacks, + callbacks=callbacks_, distributed=False, local_rank=local_rank, world_size=world_size, @@ -609,6 +407,7 @@ def _build_engine( config_params=config, ) if hasattr(ds, "timers"): + def mute_log(*args, **kwargs): pass diff --git a/allennlp/training/learning_rate_schedulers/__init__.py b/allennlp/training/learning_rate_schedulers/__init__.py index 899bf5cb91f..d1301c321ce 100644 --- a/allennlp/training/learning_rate_schedulers/__init__.py +++ b/allennlp/training/learning_rate_schedulers/__init__.py @@ -22,9 +22,15 @@ ExponentialLearningRateScheduler, ReduceOnPlateauLearningRateScheduler, ) -from allennlp.training.learning_rate_schedulers.combined import CombinedLearningRateScheduler +from allennlp.training.learning_rate_schedulers.combined import ( + CombinedLearningRateScheduler, +) from allennlp.training.learning_rate_schedulers.cosine import CosineWithRestarts from 
allennlp.training.learning_rate_schedulers.noam import NoamLR -from allennlp.training.learning_rate_schedulers.slanted_triangular import SlantedTriangular +from allennlp.training.learning_rate_schedulers.slanted_triangular import ( + SlantedTriangular, +) from allennlp.training.learning_rate_schedulers.polynomial_decay import PolynomialDecay -from allennlp.training.learning_rate_schedulers.linear_with_warmup import LinearWithWarmup +from allennlp.training.learning_rate_schedulers.linear_with_warmup import ( + LinearWithWarmup, +) diff --git a/allennlp/training/learning_rate_schedulers/combined.py b/allennlp/training/learning_rate_schedulers/combined.py index c49e9a26cc8..78f333e064a 100644 --- a/allennlp/training/learning_rate_schedulers/combined.py +++ b/allennlp/training/learning_rate_schedulers/combined.py @@ -4,7 +4,9 @@ import torch from allennlp.common.lazy import Lazy -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) @LearningRateScheduler.register("combined") diff --git a/allennlp/training/learning_rate_schedulers/cosine.py b/allennlp/training/learning_rate_schedulers/cosine.py index d9311dde387..87815d48a1a 100644 --- a/allennlp/training/learning_rate_schedulers/cosine.py +++ b/allennlp/training/learning_rate_schedulers/cosine.py @@ -4,7 +4,9 @@ import numpy as np import torch -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) logger = logging.getLogger(__name__) diff --git a/allennlp/training/learning_rate_schedulers/learning_rate_scheduler.py b/allennlp/training/learning_rate_schedulers/learning_rate_scheduler.py index 84304a6c1ec..e5d3a727060 100644 --- a/allennlp/training/learning_rate_schedulers/learning_rate_scheduler.py +++ b/allennlp/training/learning_rate_schedulers/learning_rate_scheduler.py @@ -58,7 +58,11 @@ class StepLearningRateScheduler(_PyTorchLearningRateSchedulerWrapper): """ def __init__( - self, optimizer: Optimizer, step_size: int, gamma: float = 0.1, last_epoch: int = -1 + self, + optimizer: Optimizer, + step_size: int, + gamma: float = 0.1, + last_epoch: int = -1, ) -> None: lr_scheduler = torch.optim.lr_scheduler.StepLR( optimizer=optimizer, step_size=step_size, gamma=gamma, last_epoch=last_epoch @@ -74,10 +78,17 @@ class MultiStepLearningRateScheduler(_PyTorchLearningRateSchedulerWrapper): """ def __init__( - self, optimizer: Optimizer, milestones: List[int], gamma: float = 0.1, last_epoch: int = -1 + self, + optimizer: Optimizer, + milestones: List[int], + gamma: float = 0.1, + last_epoch: int = -1, ) -> None: lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( - optimizer=optimizer, milestones=milestones, gamma=gamma, last_epoch=last_epoch + optimizer=optimizer, + milestones=milestones, + gamma=gamma, + last_epoch=last_epoch, ) super().__init__(lr_scheduler) diff --git a/allennlp/training/learning_rate_schedulers/linear_with_warmup.py b/allennlp/training/learning_rate_schedulers/linear_with_warmup.py index 09b6c839f9e..98e901f0877 100644 --- a/allennlp/training/learning_rate_schedulers/linear_with_warmup.py +++ b/allennlp/training/learning_rate_schedulers/linear_with_warmup.py @@ -1,7 +1,9 @@ import torch from allennlp.training.learning_rate_schedulers import PolynomialDecay -from 
allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) @LearningRateScheduler.register("linear_with_warmup") diff --git a/allennlp/training/learning_rate_schedulers/noam.py b/allennlp/training/learning_rate_schedulers/noam.py index e04aacd46d3..9c17f862c33 100644 --- a/allennlp/training/learning_rate_schedulers/noam.py +++ b/allennlp/training/learning_rate_schedulers/noam.py @@ -1,7 +1,9 @@ from overrides import overrides import torch -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) @LearningRateScheduler.register("noam") diff --git a/allennlp/training/learning_rate_schedulers/polynomial_decay.py b/allennlp/training/learning_rate_schedulers/polynomial_decay.py index 93cb6112455..1a201b99c88 100644 --- a/allennlp/training/learning_rate_schedulers/polynomial_decay.py +++ b/allennlp/training/learning_rate_schedulers/polynomial_decay.py @@ -1,7 +1,9 @@ from overrides import overrides import torch -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) @LearningRateScheduler.register("polynomial_decay") diff --git a/allennlp/training/learning_rate_schedulers/slanted_triangular.py b/allennlp/training/learning_rate_schedulers/slanted_triangular.py index e9166b39864..3567564600c 100644 --- a/allennlp/training/learning_rate_schedulers/slanted_triangular.py +++ b/allennlp/training/learning_rate_schedulers/slanted_triangular.py @@ -4,7 +4,9 @@ from overrides import overrides import torch -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) logger = logging.getLogger(__name__) @@ -150,7 +152,10 @@ def get_values(self): if self.freezing_current: # if we are still freezing layers, we restrict the schedule to the current epoch num_steps = actual_num_steps_per_epoch - step = min(self.last_batch_num_total - self.batch_num_total_epoch_end[-1], num_steps) + step = min( + self.last_batch_num_total - self.batch_num_total_epoch_end[-1], + num_steps, + ) else: # otherwise we use the schedule for the rest of training if not self.gradual_unfreezing: diff --git a/allennlp/training/metrics/auc.py b/allennlp/training/metrics/auc.py index 154477a463c..363a44b3fee 100644 --- a/allennlp/training/metrics/auc.py +++ b/allennlp/training/metrics/auc.py @@ -78,10 +78,12 @@ def __call__( self._all_gold_labels = self._all_gold_labels.to(gold_labels.device) self._all_predictions = torch.cat( - [self._all_predictions, torch.masked_select(predictions, mask).float()], dim=0 + [self._all_predictions, torch.masked_select(predictions, mask).float()], + dim=0, ) self._all_gold_labels = torch.cat( - [self._all_gold_labels, torch.masked_select(gold_labels, mask).long()], dim=0 + [self._all_gold_labels, torch.masked_select(gold_labels, mask).long()], + dim=0, ) if is_distributed(): @@ -91,7 +93,8 @@ def __call__( # Check if batch lengths are equal. 
_all_batch_lengths = [torch.tensor(0) for i in range(world_size)] dist.all_gather( - _all_batch_lengths, torch.tensor(len(self._all_predictions), device=device) + _all_batch_lengths, + torch.tensor(len(self._all_predictions), device=device), ) _all_batch_lengths = [batch_length.item() for batch_length in _all_batch_lengths] diff --git a/allennlp/training/metrics/categorical_accuracy.py b/allennlp/training/metrics/categorical_accuracy.py index a4e434e4b4a..7ced491810c 100644 --- a/allennlp/training/metrics/categorical_accuracy.py +++ b/allennlp/training/metrics/categorical_accuracy.py @@ -85,7 +85,8 @@ def __call__( # ith entry in gold_labels points to index (0-num_classes) for ith row in max_predictions # For each row check if index pointed by gold_label is was 1 or not (among max scored classes) correct = max_predictions_mask[ - torch.arange(gold_labels.numel(), device=gold_labels.device).long(), gold_labels + torch.arange(gold_labels.numel(), device=gold_labels.device).long(), + gold_labels, ].float() tie_counts = max_predictions_mask.sum(-1) correct /= tie_counts.float() diff --git a/allennlp/training/metrics/evalb_bracketing_scorer.py b/allennlp/training/metrics/evalb_bracketing_scorer.py index 074b154acae..55d59d1107c 100644 --- a/allennlp/training/metrics/evalb_bracketing_scorer.py +++ b/allennlp/training/metrics/evalb_bracketing_scorer.py @@ -19,7 +19,11 @@ DEFAULT_EVALB_DIR = os.path.abspath( os.path.join( - os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir, "tools", "EVALB" + os.path.dirname(os.path.realpath(__file__)), + os.pardir, + os.pardir, + "tools", + "EVALB", ) ) diff --git a/allennlp/training/metrics/fbeta_measure.py b/allennlp/training/metrics/fbeta_measure.py index bd8cce644aa..f927d36af76 100644 --- a/allennlp/training/metrics/fbeta_measure.py +++ b/allennlp/training/metrics/fbeta_measure.py @@ -233,7 +233,11 @@ def get_metric(self, reset: bool = False): "fscore": fscore.tolist(), } else: - return {"precision": precision.item(), "recall": recall.item(), "fscore": fscore.item()} + return { + "precision": precision.item(), + "recall": recall.item(), + "fscore": fscore.item(), + } @overrides def reset(self) -> None: diff --git a/allennlp/training/metrics/metric.py b/allennlp/training/metrics/metric.py index 8daab32e14d..b716606c51e 100644 --- a/allennlp/training/metrics/metric.py +++ b/allennlp/training/metrics/metric.py @@ -14,7 +14,10 @@ class Metric(Registrable): supports_distributed = False def __call__( - self, predictions: torch.Tensor, gold_labels: torch.Tensor, mask: Optional[torch.BoolTensor] + self, + predictions: torch.Tensor, + gold_labels: torch.Tensor, + mask: Optional[torch.BoolTensor], ): """ # Parameters diff --git a/allennlp/training/metrics/span_based_f1_measure.py b/allennlp/training/metrics/span_based_f1_measure.py index 17f82600c4b..8c2e41586b8 100644 --- a/allennlp/training/metrics/span_based_f1_measure.py +++ b/allennlp/training/metrics/span_based_f1_measure.py @@ -265,7 +265,9 @@ def get_metric(self, reset: bool = False): all_metrics = {} for tag in all_tags: precision, recall, f1_measure = self._compute_metrics( - self._true_positives[tag], self._false_positives[tag], self._false_negatives[tag] + self._true_positives[tag], + self._false_positives[tag], + self._false_negatives[tag], ) precision_key = "precision" + "-" + tag recall_key = "recall" + "-" + tag diff --git a/allennlp/training/metrics/spearman_correlation.py b/allennlp/training/metrics/spearman_correlation.py index cea2e70e724..b7fc3a5f4fd 100644 --- 
a/allennlp/training/metrics/spearman_correlation.py +++ b/allennlp/training/metrics/spearman_correlation.py @@ -66,7 +66,8 @@ def __call__( # Check if batch lengths are equal. _all_batch_lengths = [torch.tensor(0) for i in range(world_size)] dist.all_gather( - _all_batch_lengths, torch.tensor(self.total_predictions.shape[0], device=device) + _all_batch_lengths, + torch.tensor(self.total_predictions.shape[0], device=device), ) _all_batch_lengths = [batch_length.item() for batch_length in _all_batch_lengths] diff --git a/allennlp/training/moving_average.py b/allennlp/training/moving_average.py index 205eec973fa..2697b46a4f7 100644 --- a/allennlp/training/moving_average.py +++ b/allennlp/training/moving_average.py @@ -92,7 +92,8 @@ def apply(self, num_updates: Optional[int] = None) -> None: """ if num_updates is not None: decay = min( - self._decay, (self._numerator + num_updates) / (self._denominator + num_updates) + self._decay, + (self._numerator + num_updates) / (self._denominator + num_updates), ) else: decay = self._decay diff --git a/allennlp/training/scheduler.py b/allennlp/training/scheduler.py index 26b115b68ed..46791607690 100644 --- a/allennlp/training/scheduler.py +++ b/allennlp/training/scheduler.py @@ -27,7 +27,10 @@ class Scheduler: """ def __init__( - self, optimizer: torch.optim.Optimizer, param_group_field: str, last_epoch: int = -1 + self, + optimizer: torch.optim.Optimizer, + param_group_field: str, + last_epoch: int = -1, ) -> None: self.optimizer = optimizer self.param_group_field = param_group_field diff --git a/allennlp/training/tensorboard_writer.py b/allennlp/training/tensorboard_writer.py index 7f613afbf9b..4bc045c0d31 100644 --- a/allennlp/training/tensorboard_writer.py +++ b/allennlp/training/tensorboard_writer.py @@ -295,12 +295,25 @@ def log_metrics( # And maybe log to console if log_to_console and val_metric is not None and train_metric is not None: logger.info( - dual_message_template, name.ljust(name_length), train_metric, val_metric + dual_message_template, + name.ljust(name_length), + train_metric, + val_metric, ) elif log_to_console and val_metric is not None: - logger.info(no_train_message_template, name.ljust(name_length), "N/A", val_metric) + logger.info( + no_train_message_template, + name.ljust(name_length), + "N/A", + val_metric, + ) elif log_to_console and train_metric is not None: - logger.info(no_val_message_template, name.ljust(name_length), train_metric, "N/A") + logger.info( + no_val_message_template, + name.ljust(name_length), + train_metric, + "N/A", + ) def enable_activation_logging(self, model: Model) -> None: if self._histogram_interval is not None: diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 76bf3e0b0ab..0d259c2a646 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -752,8 +752,8 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) - if self._checkpointer is not None: - self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) + if self._checkpointer is not None: + self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) for callback in self._callbacks: callback.on_batch( diff --git a/allennlp/training/util.py b/allennlp/training/util.py index 09feb3481f3..cae03d8675a 100644 --- a/allennlp/training/util.py +++ b/allennlp/training/util.py @@ -113,7 +113,9 @@ def data_loaders_from_params( train_data_path = 
params.pop("train_data_path") logger.info("Reading training data from %s", train_data_path) data_loaders["train"] = DataLoader.from_params( - data_loader_params.duplicate(), reader=dataset_reader, data_path=train_data_path + data_loader_params.duplicate(), + reader=dataset_reader, + data_path=train_data_path, ) if not validation and not test: @@ -419,7 +421,9 @@ def description_from_metrics(metrics: Dict[str, float]) -> str: def make_vocab_from_params( - params: Params, serialization_dir: Union[str, PathLike], print_statistics: bool = False + params: Params, + serialization_dir: Union[str, PathLike], + print_statistics: bool = False, ) -> Vocabulary: vocab_params = params.pop("vocabulary", {}) os.makedirs(serialization_dir, exist_ok=True) @@ -435,7 +439,10 @@ def make_vocab_from_params( ) # Do a quick sanity check here. There's no need to load any datasets if the vocab # type is "empty" or "from_files". - if datasets_for_vocab_creation is None and vocab_params.get("type") in {"empty", "from_files"}: + if datasets_for_vocab_creation is None and vocab_params.get("type") in { + "empty", + "from_files", + }: datasets_for_vocab_creation = [] data_loaders: Dict[str, DataLoader] diff --git a/scripts/ai2_internal/resume_daemon.py b/scripts/ai2_internal/resume_daemon.py index 76b29d1796b..ced34f31255 100644 --- a/scripts/ai2_internal/resume_daemon.py +++ b/scripts/ai2_internal/resume_daemon.py @@ -145,7 +145,12 @@ def get_status(self, experiment_id: str) -> BeakerStatus: return status def resume(self, experiment_id: str) -> str: - command = ["beaker", "experiment", "resume", f"--experiment-name={experiment_id}"] + command = [ + "beaker", + "experiment", + "resume", + f"--experiment-name={experiment_id}", + ] # Small delay to avoid thrashing Beaker. time.sleep(BEAKER_QUERY_INTERVAL_SECONDS) return subprocess.check_output(command, universal_newlines=True).strip() @@ -206,7 +211,12 @@ def resume(connection: Connection, beaker: BeakerWrapper) -> None: ) cursor.execute( "INSERT INTO active_experiments VALUES (?, ?, ?, ?)", - (new_experiment_id, original_id, max_resumes, current_resume + 1), + ( + new_experiment_id, + original_id, + max_resumes, + current_resume + 1, + ), ) connection.commit() else: diff --git a/scripts/ai2_internal/run_with_beaker.py b/scripts/ai2_internal/run_with_beaker.py index 262dfde4f3f..b5ce82bf482 100755 --- a/scripts/ai2_internal/run_with_beaker.py +++ b/scripts/ai2_internal/run_with_beaker.py @@ -15,7 +15,8 @@ random_int = random.randint(0, 2 ** 32) sys.path.insert( - 0, os.path.dirname(os.path.abspath(os.path.join(os.path.join(__file__, os.pardir), os.pardir))) + 0, + os.path.dirname(os.path.abspath(os.path.join(os.path.join(__file__, os.pardir), os.pardir))), ) from allennlp.common.params import Params @@ -62,17 +63,25 @@ def main(param_file: str, args: argparse.Namespace): print("Create a Beaker image...") image = subprocess.check_output( - f"beaker image create --quiet {docker_image}", shell=True, universal_newlines=True + f"beaker image create --quiet {docker_image}", + shell=True, + universal_newlines=True, ).strip() print(f" Image created: {docker_image}") config_dataset_id = subprocess.check_output( - f"beaker dataset create --quiet {params_dir}/*", shell=True, universal_newlines=True + f"beaker dataset create --quiet {params_dir}/*", + shell=True, + universal_newlines=True, ).strip() # Arguments that differ between preemptible and regular machine execution. 
if args.preemptible: - allennlp_prefix = ["/stage/allennlp/resumable_train.sh", "/output", "/config/config.json"] + allennlp_prefix = [ + "/stage/allennlp/resumable_train.sh", + "/output", + "/config/config.json", + ] else: allennlp_prefix = [ "python", @@ -132,7 +141,14 @@ def main(param_file: str, args: argparse.Namespace): output.write(json.dumps(config, indent=4)) print(f"Beaker spec written to {output_path}.") - experiment_command = ["beaker", "experiment", "create", "--quiet", "--file", output_path] + experiment_command = [ + "beaker", + "experiment", + "create", + "--quiet", + "--file", + output_path, + ] if args.name: experiment_command.append("--name") experiment_command.append(args.name.replace(" ", "-")) @@ -175,10 +191,14 @@ def resume_command(experiment_id): parser.add_argument("param_file", type=str, help="The model configuration file.") parser.add_argument("--name", type=str, help="A name for the experiment.") parser.add_argument( - "--spec_output_path", type=str, help="The destination to write the experiment spec." + "--spec_output_path", + type=str, + help="The destination to write the experiment spec.", ) parser.add_argument( - "--dry-run", action="store_true", help="If specified, an experiment will not be created." + "--dry-run", + action="store_true", + help="If specified, an experiment will not be created.", ) parser.add_argument( "--image", type=str, help="The image to use (if unspecified one will be built)" @@ -198,11 +218,15 @@ def resume_command(experiment_id): ) parser.add_argument("--cpu", help="CPUs to reserve for this experiment (e.g., 0.5)") parser.add_argument( - "--gpu-count", default=1, help="GPUs to use for this experiment (e.g., 1 (default))" + "--gpu-count", + default=1, + help="GPUs to use for this experiment (e.g., 1 (default))", ) parser.add_argument("--memory", help="Memory to reserve for this experiment (e.g., 1GB)") parser.add_argument( - "--preemptible", action="store_true", help="Allow task to run on preemptible hardware" + "--preemptible", + action="store_true", + help="Allow task to run on preemptible hardware", ) parser.add_argument( "--max-resumes", diff --git a/scripts/build_docs_config.py b/scripts/build_docs_config.py index 36f1105e385..67e4296535e 100644 --- a/scripts/build_docs_config.py +++ b/scripts/build_docs_config.py @@ -22,7 +22,8 @@ def parse_args(): parser.add_argument("source_yaml", help="Path to the mkdocs skeleton config file.") parser.add_argument("docs_root", help="The root of the markdown docs folder.") parser.add_argument( - "api_docs_path", help="The root of the API docs within the markdown docs root folder." 
+ "api_docs_path", + help="The root of the API docs within the markdown docs root folder.", ) parser.add_argument("--docs-version", type=str, default=f"v{VERSION}") return parser.parse_args() diff --git a/scripts/close_stale_issues.py b/scripts/close_stale_issues.py index d84062099e6..7612ad75aa0 100644 --- a/scripts/close_stale_issues.py +++ b/scripts/close_stale_issues.py @@ -4,7 +4,12 @@ from github import Github -LABELS_TO_EXEMPT = ["contributions welcome", "merge when ready", "under development", "help wanted"] +LABELS_TO_EXEMPT = [ + "contributions welcome", + "merge when ready", + "under development", + "help wanted", +] def main(): diff --git a/scripts/py2md.py b/scripts/py2md.py index 798febaa6df..c6baf3ba6d1 100755 --- a/scripts/py2md.py +++ b/scripts/py2md.py @@ -333,7 +333,10 @@ def _format_function_signature( > 60 ): signature_args = ",\n ".join( - filter(lambda s: s.strip() not in ("", ","), (str(arg) for arg in func.args)) + filter( + lambda s: s.strip() not in ("", ","), + (str(arg) for arg in func.args), + ) ) parts.append("(\n " + signature_args + "\n)") else: diff --git a/scripts/train_fixtures.py b/scripts/train_fixtures.py index bbb5eb6246a..a7e0920b761 100755 --- a/scripts/train_fixtures.py +++ b/scripts/train_fixtures.py @@ -54,7 +54,8 @@ def train_fixture_gpu(config_prefix: str) -> None: # now copy back the weights and and archived model shutil.copy(os.path.join(tempdir, "best.th"), os.path.join(serialization_dir, "best_gpu.th")) shutil.copy( - os.path.join(tempdir, "model.tar.gz"), os.path.join(serialization_dir, "model_gpu.tar.gz") + os.path.join(tempdir, "model.tar.gz"), + os.path.join(serialization_dir, "model_gpu.tar.gz"), ) diff --git a/setup.py b/setup.py index 72a99236b89..1026c144a27 100644 --- a/setup.py +++ b/setup.py @@ -73,6 +73,7 @@ "lmdb", "more-itertools", ], + extras_require={"deepspeed": ["deepspeed>=0.3.7"]}, entry_points={"console_scripts": ["allennlp=allennlp.__main__:run"]}, include_package_data=True, python_requires=">=3.6.1", diff --git a/tests/commands/cached_path_test.py b/tests/commands/cached_path_test.py index 1a293d64044..12e9dfb325c 100644 --- a/tests/commands/cached_path_test.py +++ b/tests/commands/cached_path_test.py @@ -8,13 +8,25 @@ class TestCachedPathCommand(AllenNlpTestCase): def test_local_file(self, capsys): - sys.argv = ["allennlp", "cached-path", "--cache-dir", str(self.TEST_DIR), "README.md"] + sys.argv = [ + "allennlp", + "cached-path", + "--cache-dir", + str(self.TEST_DIR), + "README.md", + ] main() captured = capsys.readouterr() assert "README.md" in captured.out def test_inspect_empty_cache(self, capsys): - sys.argv = ["allennlp", "cached-path", "--cache-dir", str(self.TEST_DIR), "--inspect"] + sys.argv = [ + "allennlp", + "cached-path", + "--cache-dir", + str(self.TEST_DIR), + "--inspect", + ] main() captured = capsys.readouterr() assert "Cached resources:" in captured.out diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index 8ad4e624df5..8c93159105b 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -127,7 +127,13 @@ def test_evaluate_works_with_vocab_expansion(self): embedding_sources_mapping = json.dumps( {"_text_field_embedder.token_embedder_tokens": embeddings_filename} ) - kebab_args = ["evaluate", archive_path, evaluate_data_path, "--cuda-device", "-1"] + kebab_args = [ + "evaluate", + archive_path, + evaluate_data_path, + "--cuda-device", + "-1", + ] # TODO(mattg): the unawarded_embeddings.gz file above doesn't exist, but this test still # 
passes. This suggests that vocab extension in evaluate isn't currently doing anything, diff --git a/tests/commands/find_learning_rate_test.py b/tests/commands/find_learning_rate_test.py index f33cdb1f924..d9b92ada286 100644 --- a/tests/commands/find_learning_rate_test.py +++ b/tests/commands/find_learning_rate_test.py @@ -36,7 +36,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"), @@ -111,7 +116,12 @@ def test_find_learning_rate_args(self): FindLearningRate().add_subparser(subparsers) for serialization_arg in ["-s", "--serialization-dir"]: - raw_args = ["find-lr", "path/to/params", serialization_arg, "serialization_dir"] + raw_args = [ + "find-lr", + "path/to/params", + serialization_arg, + "serialization_dir", + ] args = parser.parse_args(raw_args) @@ -160,7 +170,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"), diff --git a/tests/commands/main_test.py b/tests/commands/main_test.py index 5931407bf14..f7fb45f08c0 100644 --- a/tests/commands/main_test.py +++ b/tests/commands/main_test.py @@ -108,7 +108,13 @@ def test_other_modules(self): serialization_dir = self.TEST_DIR / "serialization" # Run train with using the non-allennlp module. - sys.argv = ["allennlp", "train", str(config_path), "-s", str(serialization_dir)] + sys.argv = [ + "allennlp", + "train", + str(config_path), + "-s", + str(serialization_dir), + ] # Shouldn't be able to find the model. 
with pytest.raises(ConfigurationError): diff --git a/tests/commands/predict_test.py b/tests/commands/predict_test.py index 644ce442d00..ec7898f3c26 100644 --- a/tests/commands/predict_test.py +++ b/tests/commands/predict_test.py @@ -85,7 +85,13 @@ def test_works_with_known_model(self): assert len(results) == 2 for result in results: - assert set(result.keys()) == {"label", "logits", "probs", "tokens", "token_ids"} + assert set(result.keys()) == { + "label", + "logits", + "probs", + "tokens", + "token_ids", + } shutil.rmtree(self.tempdir) @@ -111,7 +117,14 @@ def test_using_dataset_reader_works_with_known_model(self): assert len(results) == 3 for result in results: - assert set(result.keys()) == {"label", "logits", "loss", "probs", "tokens", "token_ids"} + assert set(result.keys()) == { + "label", + "logits", + "loss", + "probs", + "tokens", + "token_ids", + } shutil.rmtree(self.tempdir) @@ -284,7 +297,13 @@ def test_batch_prediction_works_with_known_model(self): assert len(results) == 2 for result in results: - assert set(result.keys()) == {"label", "logits", "probs", "tokens", "token_ids"} + assert set(result.keys()) == { + "label", + "logits", + "probs", + "tokens", + "token_ids", + } shutil.rmtree(self.tempdir) @@ -458,7 +477,13 @@ def test_other_modules(self): assert len(results) == 2 # Overridden predictor should output extra field for result in results: - assert set(result.keys()) == {"label", "logits", "probs", "tokens", "token_ids"} + assert set(result.keys()) == { + "label", + "logits", + "probs", + "tokens", + "token_ids", + } def test_alternative_file_formats(self): @Predictor.register("classification-csv") diff --git a/tests/commands/print_results_test.py b/tests/commands/print_results_test.py index 3628c66e4bd..f68198ef1f3 100644 --- a/tests/commands/print_results_test.py +++ b/tests/commands/print_results_test.py @@ -28,10 +28,12 @@ def setup_method(self): open(os.path.join(self.directory1 / "metrics.json"), "w+"), ) json.dump( - {"train": 4, "dev": 5}, open(os.path.join(self.directory2 / "metrics.json"), "w+") + {"train": 4, "dev": 5}, + open(os.path.join(self.directory2 / "metrics.json"), "w+"), ) json.dump( - {"train": 6, "dev": 7}, open(os.path.join(self.directory3 / "cool_metrics.json"), "w+") + {"train": 6, "dev": 7}, + open(os.path.join(self.directory3 / "cool_metrics.json"), "w+"), ) def test_print_results(self): diff --git a/tests/commands/train_test.py b/tests/commands/train_test.py index 11156f19cef..462f6fb93d0 100644 --- a/tests/commands/train_test.py +++ b/tests/commands/train_test.py @@ -13,7 +13,12 @@ import pytest import torch -from allennlp.commands.train import Train, train_model, train_model_from_args, TrainModel +from allennlp.commands.train import ( + Train, + train_model, + train_model_from_args, + TrainModel, +) from allennlp.common import Params from allennlp.common.checks import ConfigurationError from allennlp.common.testing import AllenNlpTestCase, cpu_or_gpu, requires_multi_gpu @@ -82,7 +87,12 @@ class TestTrain(AllenNlpTestCase): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -115,7 +125,10 @@ def test_train_model(self): # It's also not OK if serialization dir is a real serialization dir: with 
pytest.raises(ConfigurationError): - train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, "test_train_model")) + train_model( + params(), + serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), + ) # But it's OK if serialization dir exists and --recover is specified: train_model( @@ -126,7 +139,9 @@ def test_train_model(self): # It's ok serialization dir exists and --force is specified (it will be deleted): train_model( - params(), serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), force=True + params(), + serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), + force=True, ) # But --force and --recover cannot both be specified @@ -167,7 +182,10 @@ def test_force_gpu(self): _seen_training_devices.clear() if torch.cuda.device_count() == 0: with pytest.raises(ConfigurationError): - train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_force_gpu")) + train_model( + params, + serialization_dir=os.path.join(self.TEST_DIR, "test_force_gpu"), + ) else: train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_force_gpu")) assert len(_seen_training_devices) == 1 @@ -203,7 +221,12 @@ def test_train_model_distributed(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -251,7 +274,12 @@ def test_train_model_deepspeed(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -260,11 +288,13 @@ def test_train_model_deepspeed(self): "trainer": { "type": "deepspeed", "deepspeed_config": { - "zero_optimization": { "stage": 2 }, - "fp16": { "enabled": True, }, + "zero_optimization": {"stage": 2}, + "fp16": { + "enabled": True, + }, }, - "num_epochs": 2, - "optimizer": "adam" + "num_epochs": 2, + "optimizer": "adam", }, "distributed": {"cuda_devices": devices}, } @@ -308,9 +338,17 @@ def test_train_model_distributed_with_sharded_reader(self, max_instances_in_memo "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, + }, + "dataset_reader": { + "type": "sharded", + "base_reader": {"type": "sequence_tagging"}, }, - "dataset_reader": {"type": "sharded", "base_reader": {"type": "sequence_tagging"}}, "train_data_path": SEQUENCE_TAGGING_SHARDS_PATH, "validation_data_path": SEQUENCE_TAGGING_SHARDS_PATH, "data_loader": { @@ -397,7 +435,12 @@ def test_train_model_distributed_without_sharded_reader(self, max_instances_in_m "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, 
"train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -479,7 +522,12 @@ def test_distributed_raises_error_with_no_gpus(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -490,7 +538,10 @@ def test_distributed_raises_error_with_no_gpus(self): } ) with pytest.raises(ConfigurationError): - train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_train_model")) + train_model( + params, + serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), + ) def test_train_saves_all_keys_in_config(self): params = Params( @@ -500,7 +551,12 @@ def test_train_saves_all_keys_in_config(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "pytorch_seed": 42, "numpy_seed": 42, @@ -532,7 +588,12 @@ def test_error_is_throw_when_cuda_device_is_not_available(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": "test_fixtures/data/sequence_tagging.tsv", @@ -547,7 +608,10 @@ def test_error_is_throw_when_cuda_device_is_not_available(self): ) with pytest.raises(ConfigurationError, match="Experiment specified"): - train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_train_model")) + train_model( + params, + serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), + ) def test_train_with_test_set(self): params = Params( @@ -557,7 +621,12 @@ def test_train_with_test_set(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -611,7 +680,12 @@ def on_batch( "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -628,7 +702,8 @@ def on_batch( } ) train_model( - params.duplicate(), serialization_dir=os.path.join(self.TEST_DIR, "train_normal") + params.duplicate(), + serialization_dir=os.path.join(self.TEST_DIR, "train_normal"), ) assert batch_callback_counter == last_num_steps_per_epoch * number_of_epochs batch_callback_counter = 0 @@ -637,7 +712,8 @@ def on_batch( original_batch_size = params["data_loader"]["batch_size"] params["data_loader"]["batch_size"] = 1 train_model( - params.duplicate(), serialization_dir=os.path.join(self.TEST_DIR, "train_with_bs1") + 
params.duplicate(), + serialization_dir=os.path.join(self.TEST_DIR, "train_with_bs1"), ) assert batch_callback_counter == last_num_steps_per_epoch * number_of_epochs batch_callback_counter = 0 @@ -656,7 +732,12 @@ def test_train_args(self): Train().add_subparser(subparsers) for serialization_arg in ["-s", "--serialization-dir"]: - raw_args = ["train", "path/to/params", serialization_arg, "serialization_dir"] + raw_args = [ + "train", + "path/to/params", + serialization_arg, + "serialization_dir", + ] args = parser.parse_args(raw_args) @@ -679,7 +760,10 @@ def test_train_model_can_instantiate_from_params(self): # Can instantiate from base class params TrainModel.from_params( - params=params, serialization_dir=self.TEST_DIR, local_rank=0, batch_weight_key="" + params=params, + serialization_dir=self.TEST_DIR, + local_rank=0, + batch_weight_key="", ) def test_train_can_fine_tune_model_from_archive(self): @@ -687,7 +771,10 @@ def test_train_can_fine_tune_model_from_archive(self): self.FIXTURES_ROOT / "basic_classifier" / "experiment_from_archive.jsonnet" ) train_loop = TrainModel.from_params( - params=params, serialization_dir=self.TEST_DIR, local_rank=0, batch_weight_key="" + params=params, + serialization_dir=self.TEST_DIR, + local_rank=0, + batch_weight_key="", ) train_loop.run() @@ -708,7 +795,12 @@ def test_train_nograd_regex(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -718,7 +810,11 @@ def test_train_nograd_regex(self): } ) serialization_dir = os.path.join(self.TEST_DIR, "test_train_nograd") - regex_lists = [[], [".*text_field_embedder.*"], [".*text_field_embedder.*", ".*encoder.*"]] + regex_lists = [ + [], + [".*text_field_embedder.*"], + [".*text_field_embedder.*", ".*encoder.*"], + ] for regex_list in regex_lists: params = params_get() params["trainer"]["no_grad"] = regex_list @@ -750,7 +846,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"), @@ -787,7 +888,16 @@ def test_dry_run_makes_vocab(self): tokens = [line.strip() for line in f] tokens.sort() - assert tokens == [".", "@@UNKNOWN@@", "animals", "are", "birds", "cats", "dogs", "snakes"] + assert tokens == [ + ".", + "@@UNKNOWN@@", + "animals", + "are", + "birds", + "cats", + "dogs", + "snakes", + ] with open(vocab_path / "labels.txt") as f: labels = [line.strip() for line in f] diff --git a/tests/common/file_utils_test.py b/tests/common/file_utils_test.py index b99636c4f14..1a8eeb0f90d 100644 --- a/tests/common/file_utils_test.py +++ b/tests/common/file_utils_test.py @@ -121,13 +121,20 @@ def mocked_http_etag(url: str): url = "https://github.com/allenai/allennlp/blob/master/some-fake-resource" # We'll create two cached versions of this fake resource using two different etags. 
- etags = ['W/"3e5885bfcbf4c47bc4ee9e2f6e5ea916"', 'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea918"'] + etags = [ + 'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea916"', + 'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea918"', + ] filenames = [ os.path.join(self.TEST_DIR, _resource_to_filename(url, etag)) for etag in etags ] for filename, etag in zip(filenames, etags): meta = _Meta( - resource=url, cached_path=filename, creation_time=time.time(), etag=etag, size=2341 + resource=url, + cached_path=filename, + creation_time=time.time(), + etag=etag, + size=2341, ) meta.to_file() with open(filename, "w") as f: @@ -216,7 +223,10 @@ def test_resource_to_filename_with_etags_eliminates_quotes(self): def test_split_s3_path(self): # Test splitting good urls. - assert _split_s3_path("s3://my-bucket/subdir/file.txt") == ("my-bucket", "subdir/file.txt") + assert _split_s3_path("s3://my-bucket/subdir/file.txt") == ( + "my-bucket", + "subdir/file.txt", + ) assert _split_s3_path("s3://my-bucket/file.txt") == ("my-bucket", "file.txt") # Test splitting bad urls. @@ -390,7 +400,9 @@ def test_remove_entries(self): ) self.create_cache_entry("http://other.fake.datastore.com/glove.txt.gz", "etag-4") self.create_cache_entry( - "http://other.fake.datastore.com/glove.txt.gz", "etag-5", as_extraction_dir=True + "http://other.fake.datastore.com/glove.txt.gz", + "etag-5", + as_extraction_dir=True, ) reclaimed_space = remove_cache_entries(["http://fake.*"], cache_dir=self.TEST_DIR) @@ -414,11 +426,13 @@ def setup_method(self): super().setup_method() self.tar_file = self.TEST_DIR / "utf-8.tar.gz" shutil.copyfile( - self.FIXTURES_ROOT / "utf-8_sample" / "archives" / "utf-8.tar.gz", self.tar_file + self.FIXTURES_ROOT / "utf-8_sample" / "archives" / "utf-8.tar.gz", + self.tar_file, ) self.zip_file = self.TEST_DIR / "utf-8.zip" shutil.copyfile( - self.FIXTURES_ROOT / "utf-8_sample" / "archives" / "utf-8.zip", self.zip_file + self.FIXTURES_ROOT / "utf-8_sample" / "archives" / "utf-8.zip", + self.zip_file, ) def check_extracted(self, extracted: str): diff --git a/tests/common/from_params_test.py b/tests/common/from_params_test.py index dfd479ffe3d..cbb5a012fbd 100644 --- a/tests/common/from_params_test.py +++ b/tests/common/from_params_test.py @@ -4,7 +4,12 @@ import torch from allennlp.common import Lazy, Params, Registrable -from allennlp.common.from_params import FromParams, takes_arg, remove_optional, create_kwargs +from allennlp.common.from_params import ( + FromParams, + takes_arg, + remove_optional, + create_kwargs, +) from allennlp.common.testing import AllenNlpTestCase from allennlp.data import DataLoader, DatasetReader, Tokenizer from allennlp.models import Model @@ -344,7 +349,10 @@ def __init__(self, a: Union[float, int]) -> None: int_param_str = '{"a": 1}' import json - for expected_type, param_str in [(int, int_param_str), (float, float_param_str)]: + for expected_type, param_str in [ + (int, int_param_str), + (float, float_param_str), + ]: for cls in [IntFloat, FloatInt]: c = cls.from_params(Params(json.loads(param_str))) assert type(c.a) == expected_type @@ -382,7 +390,10 @@ def __init__(self, items: Dict[str, A]) -> None: params = Params( { "type": "d", - "items": {"first": {"type": "b", "size": 1}, "second": {"type": "b", "size": 2}}, + "items": { + "first": {"type": "b", "size": 1}, + "second": {"type": "b", "size": 2}, + }, } ) d = C.from_params(params) @@ -465,7 +476,10 @@ def __init__(self, items: Tuple[A, C]) -> None: self.items = items params = Params( - {"type": "f", "items": [{"type": "b", "size": 1}, {"type": "d", "name": "item2"}]} + { 
+ "type": "f", + "items": [{"type": "b", "size": 1}, {"type": "d", "name": "item2"}], + } ) f = E.from_params(params) @@ -832,7 +846,10 @@ def __init__(self, items: Mapping[str, A]) -> None: params = Params( { "type": "d", - "items": {"first": {"type": "b", "size": 1}, "second": {"type": "b", "size": 2}}, + "items": { + "first": {"type": "b", "size": 1}, + "second": {"type": "b", "size": 2}, + }, } ) d = C.from_params(params) @@ -1018,7 +1035,8 @@ def __init__(self, a: int, b: str = None, **kwargs) -> None: assert foo.c is None foo = Bar.from_params( - params=Params({"type": "foo", "a": 2, "b": "hi", "c": {"2": "3"}}), extra="4" + params=Params({"type": "foo", "a": 2, "b": "hi", "c": {"2": "3"}}), + extra="4", ) assert foo.a == 2 assert foo.b == "hi" diff --git a/tests/common/params_test.py b/tests/common/params_test.py index ba7b5996a4e..c67490dbea5 100644 --- a/tests/common/params_test.py +++ b/tests/common/params_test.py @@ -65,7 +65,10 @@ def test_overrides(self, input_type): def test_unflatten(self): flattened = {"a.b.c": 1, "a.b.d": 0, "a.e.f.g.h": 2, "b": 3} unflattened = unflatten(flattened) - assert unflattened == {"a": {"b": {"c": 1, "d": 0}, "e": {"f": {"g": {"h": 2}}}}, "b": 3} + assert unflattened == { + "a": {"b": {"c": 1, "d": 0}, "e": {"f": {"g": {"h": 2}}}}, + "b": 3, + } # should do nothing to a non-flat dictionary assert unflatten(unflattened) == unflattened diff --git a/tests/common/util_test.py b/tests/common/util_test.py index 60ff5551690..196b61a23cf 100644 --- a/tests/common/util_test.py +++ b/tests/common/util_test.py @@ -38,8 +38,20 @@ def test_lazy_groups_of(self): def test_pad_sequence_to_length(self): assert util.pad_sequence_to_length([1, 2, 3], 5) == [1, 2, 3, 0, 0] - assert util.pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 2) == [1, 2, 3, 2, 2] - assert util.pad_sequence_to_length([1, 2, 3], 5, padding_on_right=False) == [0, 0, 1, 2, 3] + assert util.pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 2) == [ + 1, + 2, + 3, + 2, + 2, + ] + assert util.pad_sequence_to_length([1, 2, 3], 5, padding_on_right=False) == [ + 0, + 0, + 1, + 2, + 3, + ] def test_namespace_match(self): assert util.namespace_match("*tags", "tags") diff --git a/tests/data/data_loaders/multiprocess_data_loader_test.py b/tests/data/data_loaders/multiprocess_data_loader_test.py index e0197edee71..32fe62bae82 100644 --- a/tests/data/data_loaders/multiprocess_data_loader_test.py +++ b/tests/data/data_loaders/multiprocess_data_loader_test.py @@ -29,7 +29,9 @@ class MockDatasetReader(DatasetReader): def __init__(self, model: str = "epwalsh/bert-xsmall-dummy", **kwargs) -> None: super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self.tokenizer = PretrainedTransformerTokenizer(model) self.token_indexers = {"tokens": PretrainedTransformerIndexer(model)} @@ -104,7 +106,12 @@ def test_error_raised_when_text_fields_contain_token_indexers(max_instances_in_m [ dict(max_instances_in_memory=10, num_workers=2, batch_size=1), dict(num_workers=2, batch_size=1), - dict(max_instances_in_memory=10, num_workers=2, start_method="spawn", batch_size=1), + dict( + max_instances_in_memory=10, + num_workers=2, + start_method="spawn", + batch_size=1, + ), dict(num_workers=2, start_method="spawn", batch_size=1), dict(max_instances_in_memory=10, num_workers=0, batch_size=1), dict(num_workers=0, batch_size=1), diff --git 
a/tests/data/data_loaders/multitask_data_loader_test.py b/tests/data/data_loaders/multitask_data_loader_test.py index 35b28dfb721..0b69077b77c 100644 --- a/tests/data/data_loaders/multitask_data_loader_test.py +++ b/tests/data/data_loaders/multitask_data_loader_test.py @@ -9,7 +9,10 @@ from allennlp.data.dataset_readers import MultiTaskDatasetReader from allennlp.data.data_loaders.multitask_data_loader import MultiTaskDataLoader from allennlp.data.data_loaders.multitask_scheduler import RoundRobinScheduler -from allennlp.data.data_loaders.multitask_epoch_sampler import UniformSampler, WeightedSampler +from allennlp.data.data_loaders.multitask_epoch_sampler import ( + UniformSampler, + WeightedSampler, +) class FakeDatasetReaderA(DatasetReader): diff --git a/tests/data/dataloader_test.py b/tests/data/dataloader_test.py new file mode 100644 index 00000000000..cb422f61945 --- /dev/null +++ b/tests/data/dataloader_test.py @@ -0,0 +1,46 @@ +from typing import Iterable + +import pytest + +from allennlp.data.fields import LabelField +from allennlp.data.instance import Instance +from allennlp.data.dataloader import PyTorchDataLoader +from allennlp.data.dataset_readers.dataset_reader import DatasetReader + + +@pytest.mark.parametrize("lazy", (True, False)) +def test_loader_uses_all_instances_when_batches_per_epochs_set(lazy): + NUM_INSTANCES = 20 + BATCH_SIZE = 2 + BATCHES_PER_EPOCH = 3 + EPOCHS = 4 + + class FakeDatasetReader(DatasetReader): + def _read(self, filename: str) -> Iterable[Instance]: + for i in range(NUM_INSTANCES): + yield Instance({"index": LabelField(i, skip_indexing=True)}) + + reader = FakeDatasetReader(lazy=lazy) + dataset = reader.read("blah") + + loader = PyTorchDataLoader(dataset, batch_size=BATCH_SIZE, batches_per_epoch=BATCHES_PER_EPOCH) + epoch_batches = [] + for epoch in range(EPOCHS): + batches = [] + for batch in loader: + instances = [] + for index in batch["index"]: + instances.append(index) + batches.append(instances) + epoch_batches.append(batches) + + assert epoch_batches == [ + # Epoch 0. + [[0, 1], [2, 3], [4, 5]], + # Epoch 1. + [[6, 7], [8, 9], [10, 11]], + # Epoch 2. + [[12, 13], [14, 15], [16, 17]], + # Epoch 3. 
+ [[18, 19], [0, 1], [2, 3]], + ] diff --git a/tests/data/dataset_readers/babi_reader_test.py b/tests/data/dataset_readers/babi_reader_test.py index 687f548ed10..3fcf244e652 100644 --- a/tests/data/dataset_readers/babi_reader_test.py +++ b/tests/data/dataset_readers/babi_reader_test.py @@ -1,15 +1,18 @@ import pytest from allennlp.common import Params +from allennlp.common.util import ensure_list from allennlp.data.dataset_readers import BabiReader from allennlp.common.testing import AllenNlpTestCase class TestBAbIReader: - @pytest.mark.parametrize("keep_sentences", [False, True]) - def test_read_from_file(self, keep_sentences): - reader = BabiReader(keep_sentences=keep_sentences) - instances = list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "babi.txt")) + @pytest.mark.parametrize( + "keep_sentences, lazy", [(False, False), (False, True), (True, False), (True, True)] + ) + def test_read_from_file(self, keep_sentences, lazy): + reader = BabiReader(keep_sentences=keep_sentences, lazy=lazy) + instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "babi.txt")) assert len(instances) == 8 if keep_sentences: diff --git a/tests/data/dataset_readers/dataset_reader_test.py b/tests/data/dataset_readers/dataset_reader_test.py index aedebefded3..22eab7abaeb 100644 --- a/tests/data/dataset_readers/dataset_reader_test.py +++ b/tests/data/dataset_readers/dataset_reader_test.py @@ -1,85 +1,287 @@ -from itertools import islice -from typing import Optional, List, Set +from collections import deque +import os +import shutil +from typing import Optional, NamedTuple, List +from filelock import FileLock import pytest import torch.distributed as dist +from allennlp.common.testing import AllenNlpTestCase from allennlp.common import util as common_util +from allennlp.common.checks import ConfigurationError from allennlp.data import Instance +from allennlp.data.dataloader import PyTorchDataLoader from allennlp.data.dataset_readers import ( + dataset_reader, DatasetReader, - WorkerInfo, + TextClassificationJsonReader, ) +from allennlp.data.dataset_readers.dataset_reader import AllennlpLazyDataset from allennlp.data.fields import LabelField -TOTAL_INSTANCES = 100 +def mock_collate_fn(item): + return item[0] -class MockDatasetReader(DatasetReader): - def _read(self, file_path): - for i in range(TOTAL_INSTANCES): - yield self.text_to_instance(i) +class TestDatasetReader(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + self.cache_directory = str(AllenNlpTestCase.FIXTURES_ROOT / "data_cache" / "with_prefix") - def text_to_instance(self, index: int): # type: ignore - return Instance({"index": LabelField(index, skip_indexing=True)}) + def teardown_method(self): + super().teardown_method() + if os.path.exists(self.cache_directory): + shutil.rmtree(self.cache_directory) + def test_lazy_dataset_can_be_iterated_through_multiple_times(self): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(lazy=True) + instances = reader.read(data_file) + assert isinstance(instances, AllennlpLazyDataset) -class MockMmpsDatasetReader(DatasetReader): - """ - Implements manual multi-process sharding (MMPS). 
- """ + first_pass_instances = list(instances) + assert len(first_pass_instances) > 2 + second_pass_instances = list(instances) + assert first_pass_instances == second_pass_instances - def __init__(self, **kwargs) -> None: - super().__init__(manual_multiprocess_sharding=True, **kwargs) + def test_read_only_creates_cache_file_once(self): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(cache_directory=self.cache_directory) + cache_file = reader._get_cache_location_for_file_path(str(data_file)) - def _read(self, file_path): - start_index = 0 - step_size = 1 - worker_info = self.get_worker_info() - if worker_info is not None: - start_index += worker_info.id - step_size *= worker_info.num_workers - for i in islice(range(TOTAL_INSTANCES), start_index, None, step_size): - yield self.text_to_instance(i) + # The first read will create the cache. + reader.read(data_file) + assert os.path.exists(cache_file) + with open(cache_file, "r") as in_file: + cache_contents = in_file.read() + # The second and all subsequent reads should _use_ the cache, not modify it. I looked + # into checking file modification times, but this test will probably be faster than the + # granularity of `os.path.getmtime()` (which only returns values in seconds). + reader.read(data_file) + reader.read(data_file) + reader.read(data_file) + reader.read(data_file) + with open(cache_file, "r") as in_file: + final_cache_contents = in_file.read() + assert cache_contents == final_cache_contents - def text_to_instance(self, index: int): # type: ignore - return Instance({"index": LabelField(index, skip_indexing=True)}) + @pytest.mark.parametrize("lazy", (True, False)) + def test_caching_works_with_lazy_reading(self, caplog, lazy: bool): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + snli_copy_file = str(data_file) + ".copy" + shutil.copyfile(data_file, snli_copy_file) + reader = TextClassificationJsonReader(lazy=lazy, cache_directory=self.cache_directory) + cache_file = reader._get_cache_location_for_file_path(snli_copy_file) + # The call to read() will give us an _iterator_. We'll iterate over it multiple times, + # and the caching behavior should change as we go. + assert not os.path.exists(cache_file) + instances = reader.read(snli_copy_file) -class MockMdsDatasetReader(DatasetReader): - """ - Implements manual distributed sharding (MDS). - """ + # The first iteration will create the cache + first_pass_instances = [] + for instance in instances: + first_pass_instances.append(instance) + assert "Caching instances to temp file" in " ".join([rec.message for rec in caplog.records]) + assert os.path.exists(cache_file) - def __init__(self, **kwargs) -> None: - super().__init__(manual_distributed_sharding=True, **kwargs) + # Now we _remove_ the data file, to be sure we're reading from the cache. 
+ os.remove(snli_copy_file) + caplog.clear() + instances = reader.read(snli_copy_file) + second_pass_instances = [] + for instance in instances: + second_pass_instances.append(instance) + assert "Reading instances from cache" in " ".join([rec.message for rec in caplog.records]) - def _read(self, file_path): - start_index = 0 - step_size = 1 - if common_util.is_distributed(): - start_index += dist.get_rank() - step_size *= dist.get_world_size() - for i in islice(range(TOTAL_INSTANCES), start_index, None, step_size): - yield self.text_to_instance(i) + # We should get the same instances both times. + assert len(first_pass_instances) == len(second_pass_instances) + for instance, cached_instance in zip(first_pass_instances, second_pass_instances): + assert instance.fields == cached_instance.fields - def text_to_instance(self, index: int): # type: ignore - return Instance({"index": LabelField(index, skip_indexing=True)}) + # And just to be super paranoid, in case the second pass somehow bypassed the cache + # because of a bug that's hard to detect, we'll read the + # instances from the cache with a non-lazy iterator and make sure they're the same. + reader = TextClassificationJsonReader(lazy=False, cache_directory=self.cache_directory) + cached_instances = reader.read(snli_copy_file) + assert len(first_pass_instances) == len(cached_instances) + for instance, cached_instance in zip(first_pass_instances, cached_instances): + assert instance.fields == cached_instance.fields + + @pytest.mark.parametrize("lazy", (True, False)) + def test_caching_skipped_when_lock_not_acquired(self, caplog, lazy: bool): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(lazy=lazy, cache_directory=self.cache_directory) + reader.CACHE_FILE_LOCK_TIMEOUT = 1 + cache_file = reader._get_cache_location_for_file_path(str(data_file)) + + with FileLock(cache_file + ".lock"): + # Right now we hold the lock on the cache, so the reader shouldn't + # be able to write to it. It will wait for 1 second (because that's what + # we set the timeout to be), and then just read the instances as normal. + caplog.clear() + instances = list(reader.read(data_file)) + assert "Failed to acquire lock" in caplog.text + assert instances + + # We didn't write to the cache because we couldn't acquire the file lock. + assert not os.path.exists(cache_file) + + # Now we'll write to the cache and then try the same thing again, this + # time making sure that we can still successfully read without the cache + # when the lock can't be acquired. + deque(reader.read(data_file), maxlen=1) + assert os.path.exists(cache_file) + + with FileLock(cache_file + ".lock"): + # Right now we hold the lock on the cache, so the reader shouldn't + # be able to write to it. It will wait for 1 second (because that's what + # we set the timeout to be), and then just read the instances as normal. 
+ caplog.clear() + instances = list(reader.read(data_file)) + assert "Failed to acquire lock" in caplog.text + assert instances + + @pytest.mark.parametrize("lazy", (True, False)) + def test_caching_skipped_with_distributed_training(self, caplog, monkeypatch, lazy): + monkeypatch.setattr(common_util, "is_distributed", lambda: True) + monkeypatch.setattr(dist, "get_rank", lambda: 0) + monkeypatch.setattr(dist, "get_world_size", lambda: 1) + + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(lazy=lazy, cache_directory=self.cache_directory) + cache_file = reader._get_cache_location_for_file_path(str(data_file)) + + deque(reader.read(data_file), maxlen=1) + assert not os.path.exists(cache_file) + assert "Can't cache data instances when there are multiple processes" in caplog.text + + def test_caching_with_lazy_reader_in_multi_process_loader(self): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(lazy=True, cache_directory=self.cache_directory) + deque( + PyTorchDataLoader(reader.read(data_file), collate_fn=mock_collate_fn, num_workers=2), + maxlen=0, + ) + + # We shouldn't write to the cache when the data is being loaded from multiple + # processes. + cache_file = reader._get_cache_location_for_file_path(str(data_file)) + assert not os.path.exists(cache_file) + + # But try again from the main process and we should see the cache file. + instances = list(reader.read(data_file)) + assert instances + assert os.path.exists(cache_file) + + # Reading again from a multi-process loader should read from the cache. + new_instances = list( + PyTorchDataLoader(reader.read(data_file), collate_fn=mock_collate_fn, num_workers=2) + ) + assert len(instances) == len(new_instances) + + @pytest.mark.parametrize("lazy", (True, False)) + def test_max_instances(self, lazy): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(max_instances=2, lazy=lazy) + instances = reader.read(data_file) + instance_count = sum(1 for _ in instances) + assert instance_count == 2 + + @pytest.mark.parametrize("num_workers", (0, 1, 2)) + def test_max_instances_with_multi_process_loader(self, num_workers): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(max_instances=2, lazy=True) + instances = list( + PyTorchDataLoader( + reader.read(data_file), collate_fn=mock_collate_fn, num_workers=num_workers + ) + ) + assert len(instances) == 2 + + @pytest.mark.parametrize("lazy", (True, False)) + def test_cached_max_instances(self, lazy): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + + # If we try reading with max instances, it shouldn't write to the cache. + reader = TextClassificationJsonReader( + cache_directory=self.cache_directory, lazy=lazy, max_instances=2 + ) + instances = list(reader.read(data_file)) + assert len(instances) == 2 + cache_file = reader._get_cache_location_for_file_path(str(data_file)) + assert not os.path.exists(cache_file) -class MockMmpdsDatasetReader(DatasetReader): - """ - Implements manual multi-process and distributed sharding (MMPDS). 
- """ + # Now reading again with no max_instances specified should create the cache. + reader = TextClassificationJsonReader(cache_directory=self.cache_directory, lazy=lazy) + instances = list(reader.read(data_file)) + assert len(instances) > 2 + assert os.path.exists(cache_file) - def __init__(self, **kwargs) -> None: - super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + # The second read should only return two instances, even though it's from the cache. + reader = TextClassificationJsonReader( + cache_directory=self.cache_directory, max_instances=2, lazy=lazy ) + instances = list(reader.read(data_file)) + assert len(instances) == 2 + + +class MockWorkerInfo(NamedTuple): + id: int + num_workers: int + +class MockDatasetReader(DatasetReader): def _read(self, file_path): - for i in self.shard_iterable(range(TOTAL_INSTANCES)): + for i in range(10): yield self.text_to_instance(i) def text_to_instance(self, index: int): # type: ignore @@ -87,94 +289,94 @@ def text_to_instance(self, index: int): # type: ignore @pytest.mark.parametrize( - "world_size, num_workers, max_instances", + "node_rank, world_size, worker_id, num_workers, max_instances, expected_result", [ - (4, 2, None), - (4, 2, 67), - (4, None, None), - (4, None, None), - (None, 2, None), - (None, 2, 67), - (None, None, None), - (None, None, 67), + (None, None, None, None, None, list(range(10))), + (None, None, None, None, 5, list(range(5))), + (None, None, None, None, 12, list(range(10))), + (None, None, 0, 1, None, list(range(10))), + (None, None, 0, 2, None, [0, 2, 4, 6, 8]), + (None, None, 1, 2, None, [1, 3, 5, 7, 9]), + (None, None, 0, 2, 5, [0, 2, 4]), + (None, None, 1, 2, 5, [1, 3]), + (0, 1, None, None, None, list(range(10))), + (0, 2, None, None, None, [0, 2, 4, 6, 8]), + (1, 2, None, None, None, [1, 3, 5, 7, 9]), + (0, 2, None, None, 5, [0, 2, 4]), + (1, 2, None, None, 5, [1, 3]), + (0, 2, 0, 2, None, [0, 4, 8]), + (0, 2, 1, 2, None, [1, 5, 9]), + (1, 2, 0, 2, None, [2, 6]), + (1, 2, 1, 2, None, [3, 7]), + (0, 2, 0, 2, 5, [0, 4]), ], ) -@pytest.mark.parametrize( - "reader_class", - [MockDatasetReader, MockMmpsDatasetReader, MockMdsDatasetReader, MockMmpdsDatasetReader], -) def test_instance_slicing( monkeypatch, - reader_class, + node_rank: Optional[int], world_size: Optional[int], + worker_id: Optional[int], num_workers: Optional[int], max_instances: Optional[int], + expected_result: List[int], ): - """ - Ensure that the intances read by each worker are always unique and the total - adds up to `max_instances`. 
- """ - results: List[Set[int]] = [] - - minimum_expected_result_size = max_instances or TOTAL_INSTANCES - maximum_expected_result_size = max_instances or TOTAL_INSTANCES - - if world_size is not None and num_workers is not None: - minimum_expected_result_size //= world_size - minimum_expected_result_size //= num_workers - maximum_expected_result_size = minimum_expected_result_size + 1 - for global_rank in range(world_size): - monkeypatch.setattr(common_util, "is_distributed", lambda: True) - monkeypatch.setattr(dist, "get_rank", lambda: global_rank) - monkeypatch.setattr(dist, "get_world_size", lambda: world_size) - for worker_id in range(num_workers): - reader = reader_class(max_instances=max_instances) - reader._set_worker_info(WorkerInfo(num_workers, worker_id)) - result = set( - x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore - ) - results.append(result) - elif world_size is not None: - minimum_expected_result_size //= world_size - maximum_expected_result_size = minimum_expected_result_size + 1 - for global_rank in range(world_size): - monkeypatch.setattr(common_util, "is_distributed", lambda: True) - monkeypatch.setattr(dist, "get_rank", lambda: global_rank) - monkeypatch.setattr(dist, "get_world_size", lambda: world_size) - reader = reader_class(max_instances=max_instances) - result = set( - x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore - ) - results.append(result) - elif num_workers is not None: - minimum_expected_result_size //= num_workers - maximum_expected_result_size = minimum_expected_result_size + 1 - for worker_id in range(num_workers): - reader = reader_class(max_instances=max_instances) - reader._set_worker_info(WorkerInfo(num_workers, worker_id)) - result = set( - x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore - ) - results.append(result) - else: - reader = reader_class(max_instances=max_instances) - result = set( - x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore + if node_rank is not None and world_size is not None: + monkeypatch.setattr(common_util, "is_distributed", lambda: True) + monkeypatch.setattr(dist, "get_rank", lambda: node_rank) + monkeypatch.setattr(dist, "get_world_size", lambda: world_size) + + if worker_id is not None and num_workers is not None: + monkeypatch.setattr( + dataset_reader, "get_worker_info", lambda: MockWorkerInfo(worker_id, num_workers) ) - results.append(result) - - # We need to check that all of the result sets are mutually exclusive and that they're - # union has size `max_instances`. - # Checking that they're mutually exclusive is equivalent to checking that the sum - # of the size of each set is equal to the size of the union. - - union: Set[int] = set() - total: int = 0 - for result in results: - union |= result - total += len(result) - # Also make sure the size of the set is within the expected bounds. 
- assert minimum_expected_result_size <= len(result) - assert len(result) <= maximum_expected_result_size - - assert len(union) == total == (max_instances or TOTAL_INSTANCES) + + reader = MockDatasetReader(max_instances=max_instances) + result = list((x["index"].label for x in reader.read("the-path-doesnt-matter"))) # type: ignore + + assert result == expected_result + + +class BadLazyReader(DatasetReader): + def _read(self, file_path): + return [self.text_to_instance(i) for i in range(10)] + + def text_to_instance(self, index: int): # type: ignore + return Instance({"index": LabelField(index, skip_indexing=True)}) + + +def test_config_error_when_lazy_reader_returns_list(): + reader = BadLazyReader(lazy=True) + with pytest.raises(ConfigurationError, match="must return a generator"): + deque(reader.read("path"), maxlen=0) + + +class BadReaderReadsNothing(DatasetReader): + def _read(self, file_path): + return [] + + def text_to_instance(self, index: int): # type: ignore + return Instance({"index": LabelField(index, skip_indexing=True)}) + + +def test_config_error_when_reader_returns_no_instances(): + reader = BadReaderReadsNothing() + with pytest.raises(ConfigurationError, match="No instances were read"): + deque(reader.read("path"), maxlen=0) + + +class BadReaderForgetsToSetLazy(DatasetReader): + def __init__(self): + pass + + def _read(self, file_path): + for i in range(10): + yield self.text_to_instance(i) + + def text_to_instance(self, index: int): # type: ignore + return Instance({"index": LabelField(index, skip_indexing=True)}) + + +def warning_when_reader_has_no_lazy_set(): + with pytest.warns(UserWarning, match="DatasetReader.lazy is not set"): + reader = BadReaderForgetsToSetLazy() + reader.read("path") diff --git a/tests/data/dataset_readers/dataset_utils/span_utils_test.py b/tests/data/dataset_readers/dataset_utils/span_utils_test.py index a4bf767a07e..e8714c71d17 100644 --- a/tests/data/dataset_readers/dataset_utils/span_utils_test.py +++ b/tests/data/dataset_readers/dataset_utils/span_utils_test.py @@ -3,7 +3,8 @@ from allennlp.common.testing import AllenNlpTestCase from allennlp.data.dataset_readers.dataset_utils import span_utils -from allennlp.data.tokenizers import Token, SpacyTokenizer +from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer +from allennlp.data.tokenizers.token import Token class SpanUtilsTest(AllenNlpTestCase): diff --git a/tests/data/dataset_readers/interleaving_dataset_reader_test.py b/tests/data/dataset_readers/interleaving_dataset_reader_test.py index 5e32138eae0..cdd7de2a3be 100644 --- a/tests/data/dataset_readers/interleaving_dataset_reader_test.py +++ b/tests/data/dataset_readers/interleaving_dataset_reader_test.py @@ -32,11 +32,11 @@ def test_round_robin(self): reader = InterleavingDatasetReader(readers) data_dir = self.FIXTURES_ROOT / "data" - file_path = { - "a": data_dir / "babi.txt", - "b": data_dir / "conll2003.txt", - "c": data_dir / "conll2003.txt", - } + file_path = f"""{{ + "a": "{data_dir / 'babi.txt'}", + "b": "{data_dir / 'conll2003.txt'}", + "c": "{data_dir / 'conll2003.txt'}" + }}""" instances = list(reader.read(file_path)) first_three_keys = {instance.fields["dataset"].metadata for instance in instances[:3]} diff --git a/tests/data/dataset_readers/lazy_dataset_reader_test.py b/tests/data/dataset_readers/lazy_dataset_reader_test.py new file mode 100644 index 00000000000..55ded98d6cf --- /dev/null +++ b/tests/data/dataset_readers/lazy_dataset_reader_test.py @@ -0,0 +1,62 @@ +from typing import Iterable, List + +from 
allennlp.data.fields import TextField +from allennlp.data.instance import Instance +from allennlp.data.dataset_readers import DatasetReader +from allennlp.data.token_indexers import SingleIdTokenIndexer +from allennlp.data.tokenizers import Token +from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.util import ensure_list + + +class LazyDatasetReader(DatasetReader): + def __init__(self, instances: List[Instance], lazy: bool) -> None: + super().__init__() + self.lazy = lazy + self._instances = instances + self.num_reads = 0 + + def _read(self, _: str) -> Iterable[Instance]: + self.num_reads += 1 + return (instance for instance in self._instances) + + +class TestLazyDatasetReader(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + token_indexer = {"tokens": SingleIdTokenIndexer()} + + field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]], token_indexer) + field2 = TextField( + [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]], token_indexer + ) + field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]], token_indexer) + field4 = TextField([Token(t) for t in ["this", "is", "short"]], token_indexer) + self.instances = [ + Instance({"text1": field1, "text2": field2}), + Instance({"text1": field3, "text2": field4}), + ] + + def test_lazy(self): + reader = LazyDatasetReader(self.instances, lazy=True) + assert reader.num_reads == 0 + + instances = reader.read("path/to/file") + + for _ in range(10): + _instances = (i for i in instances) + assert ensure_list(_instances) == self.instances + + assert reader.num_reads == 10 + + def test_non_lazy(self): + reader = LazyDatasetReader(self.instances, lazy=False) + assert reader.num_reads == 0 + + instances = reader.read("path/to/file") + + for _ in range(10): + _instances = (i for i in instances) + assert ensure_list(_instances) == self.instances + + assert reader.num_reads == 1 diff --git a/tests/data/dataset_readers/sequence_tagging_test.py b/tests/data/dataset_readers/sequence_tagging_test.py index 1da3fca977b..23ce6234456 100644 --- a/tests/data/dataset_readers/sequence_tagging_test.py +++ b/tests/data/dataset_readers/sequence_tagging_test.py @@ -1,13 +1,16 @@ +import pytest + from allennlp.data.dataset_readers import SequenceTaggingDatasetReader +from allennlp.common.util import ensure_list from allennlp.common.testing import AllenNlpTestCase class TestSequenceTaggingDatasetReader: - def test_default_format(self): - reader = SequenceTaggingDatasetReader() - instances = list( - reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv") - ) + @pytest.mark.parametrize("lazy", (True, False)) + def test_default_format(self, lazy): + reader = SequenceTaggingDatasetReader(lazy=lazy) + instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv") + instances = ensure_list(instances) assert len(instances) == 4 fields = instances[0].fields @@ -25,7 +28,8 @@ def test_default_format(self): def test_brown_corpus_format(self): reader = SequenceTaggingDatasetReader(word_tag_delimiter="/") - instances = list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt")) + instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt") + instances = ensure_list(instances) assert len(instances) == 4 fields = instances[0].fields diff --git a/tests/data/dataset_readers/sharded_dataset_reader_test.py b/tests/data/dataset_readers/sharded_dataset_reader_test.py index d1fa329ec28..80bff533c8b 
100644 --- a/tests/data/dataset_readers/sharded_dataset_reader_test.py +++ b/tests/data/dataset_readers/sharded_dataset_reader_test.py @@ -1,13 +1,16 @@ +from collections import Counter import glob import os import tarfile -from collections import Counter from typing import Tuple +import pytest + from allennlp.common.testing import AllenNlpTestCase from allennlp.data.dataset_readers import ( SequenceTaggingDatasetReader, ShardedDatasetReader, + DatasetReader, ) from allennlp.data.instance import Instance @@ -22,12 +25,27 @@ def fingerprint(instance: Instance) -> Tuple[str, ...]: return text_tuple + labels_tuple +def test_exception_raised_when_base_reader_implements_sharding(): + class ManuallyShardedBaseReader(DatasetReader): + def __init__(self, **kwargs): + super().__init__(manual_distributed_sharding=True, **kwargs) + + def _read(self, file_path: str): + pass + + def text_to_instance(self, text: str): # type: ignore + pass + + with pytest.raises(ValueError, match="should not implement manual distributed sharding"): + ShardedDatasetReader(ManuallyShardedBaseReader()) + + class TestShardedDatasetReader(AllenNlpTestCase): def setup_method(self) -> None: super().setup_method() # use SequenceTaggingDatasetReader as the base reader - self.base_reader = SequenceTaggingDatasetReader() + self.base_reader = SequenceTaggingDatasetReader(lazy=True) base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv" # Make 100 copies of the data diff --git a/tests/data/dataset_readers/text_classification_json_test.py b/tests/data/dataset_readers/text_classification_json_test.py index 88d72dc0b4b..4baf5f7c30b 100644 --- a/tests/data/dataset_readers/text_classification_json_test.py +++ b/tests/data/dataset_readers/text_classification_json_test.py @@ -2,21 +2,24 @@ from typing import List from allennlp.data.dataset_readers import TextClassificationJsonReader +from allennlp.common.util import ensure_list from allennlp.common.testing import AllenNlpTestCase from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter from allennlp.common.util import get_spacy_model class TestTextClassificationJsonReader: - def test_set_skip_indexing_true(self): - reader = TextClassificationJsonReader(skip_label_indexing=True) + @pytest.mark.parametrize("lazy", (True, False)) + def test_set_skip_indexing_true(self, lazy): + reader = TextClassificationJsonReader(lazy=lazy, skip_label_indexing=True) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "integer_labels.jsonl" ) - instances = list(reader.read(ag_path)) + instances = reader.read(ag_path) + instances = ensure_list(instances) instance1 = {"tokens": ["This", "text", "has", "label", "0"], "label": 0} instance2 = {"tokens": ["This", "text", "has", "label", "1"], "label": 1} @@ -36,18 +39,20 @@ def test_set_skip_indexing_true(self): / "text_classification_json" / "imdb_corpus.jsonl" ) - list(reader.read(ag_path)) + ensure_list(reader.read(ag_path)) assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True." 
- def test_read_from_file_ag_news_corpus(self): - reader = TextClassificationJsonReader() + @pytest.mark.parametrize("lazy", (True, False)) + def test_read_from_file_ag_news_corpus(self, lazy): + reader = TextClassificationJsonReader(lazy=lazy) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "ag_news_corpus.jsonl" ) - instances = list(reader.read(ag_path)) + instances = reader.read(ag_path) + instances = ensure_list(instances) instance1 = { "tokens": [ @@ -176,15 +181,17 @@ def test_read_from_file_ag_news_corpus(self): assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"] assert fields["label"].label == instance3["label"] - def test_read_from_file_ag_news_corpus_and_truncates_properly(self): - reader = TextClassificationJsonReader(max_sequence_length=5) + @pytest.mark.parametrize("lazy", (True, False)) + def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy): + reader = TextClassificationJsonReader(lazy=lazy, max_sequence_length=5) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "ag_news_corpus.jsonl" ) - instances = list(reader.read(ag_path)) + instances = reader.read(ag_path) + instances = ensure_list(instances) instance1 = {"tokens": ["Memphis", "Rout", "Still", "Stings", "for"], "label": "2"} instance2 = {"tokens": ["AP", "-", "Eli", "Manning", "has"], "label": "2"} @@ -202,11 +209,12 @@ def test_read_from_file_ag_news_corpus_and_truncates_properly(self): assert fields["label"].label == instance3["label"] @pytest.mark.parametrize("max_sequence_length", (None, 5)) + @pytest.mark.parametrize("lazy", (True, False)) def test_read_from_file_ag_news_corpus_and_segments_sentences_properly( - self, max_sequence_length + self, lazy, max_sequence_length ): reader = TextClassificationJsonReader( - segment_sentences=True, max_sequence_length=max_sequence_length + lazy=lazy, segment_sentences=True, max_sequence_length=max_sequence_length ) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT @@ -214,7 +222,8 @@ def test_read_from_file_ag_news_corpus_and_segments_sentences_properly( / "text_classification_json" / "ag_news_corpus.jsonl" ) - instances = list(reader.read(ag_path)) + instances = reader.read(ag_path) + instances = ensure_list(instances) splitter = SpacySentenceSplitter() spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False, False) diff --git a/tests/data/fields/array_field_test.py b/tests/data/fields/array_field_test.py new file mode 100644 index 00000000000..fbb0eb7da84 --- /dev/null +++ b/tests/data/fields/array_field_test.py @@ -0,0 +1,115 @@ +import numpy +import torch + +from allennlp.common.testing.test_case import AllenNlpTestCase +from allennlp.data.fields import ArrayField, ListField + + +class TestArrayField(AllenNlpTestCase): + def test_get_padding_lengths_correctly_returns_ordered_shape(self): + shape = [3, 4, 5, 6] + array = numpy.zeros(shape) + array_field = ArrayField(array) + lengths = array_field.get_padding_lengths() + for i in range(len(lengths)): + assert lengths["dimension_{}".format(i)] == shape[i] + + def test_as_tensor_handles_larger_padding_dimensions(self): + shape = [3, 4] + array = numpy.ones(shape) + array_field = ArrayField(array) + + padded_tensor = ( + array_field.as_tensor({"dimension_0": 5, "dimension_1": 6}).detach().cpu().numpy() + ) + numpy.testing.assert_array_equal(padded_tensor[:3, :4], array) + numpy.testing.assert_array_equal(padded_tensor[3:, 4:], 0.0) + + def test_padding_handles_list_fields(self): + array1 = ArrayField(numpy.ones([2, 
3])) + array2 = ArrayField(numpy.ones([1, 5])) + empty_array = array1.empty_field() + list_field = ListField([array1, array2, empty_array]) + + returned_tensor = ( + list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy() + ) + correct_tensor = numpy.array( + [ + [[1.0, 1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0, 0.0]], + [[1.0, 1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0]], + ] + ) + numpy.testing.assert_array_equal(returned_tensor, correct_tensor) + + def test_padding_handles_list_fields_with_padding_values(self): + array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1) + array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1) + empty_array = array1.empty_field() + list_field = ListField([array1, array2, empty_array]) + + returned_tensor = ( + list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy() + ) + correct_tensor = numpy.array( + [ + [[1.0, 1.0, 1.0, -1.0, -1.0], [1.0, 1.0, 1.0, -1.0, -1.0]], + [[1.0, 1.0, 1.0, 1.0, 1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]], + [[-1.0, -1.0, -1.0, -1.0, -1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]], + ] + ) + numpy.testing.assert_array_equal(returned_tensor, correct_tensor) + + def test_printing_doesnt_crash(self): + array = ArrayField(numpy.ones([2, 3]), padding_value=-1) + print(array) + + def test_as_tensor_works_with_scalar(self): + array = ArrayField(numpy.asarray(42)) + returned_tensor = array.as_tensor(array.get_padding_lengths()) + current_tensor = numpy.asarray(42) + numpy.testing.assert_array_equal(returned_tensor, current_tensor) + + def test_as_tensor_with_scalar_keeps_dtype(self): + array = ArrayField(numpy.asarray(42, dtype=numpy.float32)) + returned_tensor = array.as_tensor(array.get_padding_lengths()) + assert returned_tensor.dtype == torch.float32 + + def test_alternative_dtypes(self): + shape = [3, 4, 5, 6] + array = numpy.zeros(shape) + + # Setting dtype to numpy.int64 should produce a torch.LongTensor when field is converted to + # a tensor + array_field1 = ArrayField(array, dtype=numpy.int64) + returned_tensor1 = array_field1.as_tensor(array_field1.get_padding_lengths()) + assert returned_tensor1.dtype == torch.int64 + + # Setting dtype to numpy.uint8 should produce a torch.ByteTensor when field is converted to + # a tensor + array_field2 = ArrayField(array, dtype=numpy.uint8) + returned_tensor2 = array_field2.as_tensor(array_field2.get_padding_lengths()) + assert returned_tensor2.dtype == torch.uint8 + + # Padding should not affect dtype + padding_lengths = {"dimension_" + str(i): 10 for i, _ in enumerate(shape)} + padded_tensor = array_field2.as_tensor(padding_lengths) + assert padded_tensor.dtype == torch.uint8 + + # Empty fields should have the same dtype + empty_field = array_field2.empty_field() + assert empty_field.dtype == array_field2.dtype + + def test_len_works_with_scalar(self): + array = ArrayField(numpy.asarray(42)) + assert len(array) == 1 + + def test_eq(self): + array1 = ArrayField(numpy.asarray([1, 1, 1])) + array2 = ArrayField(numpy.asarray([[1, 1, 1], [1, 1, 1]])) + array3 = ArrayField(numpy.asarray([1, 1, 2])) + array4 = ArrayField(numpy.asarray([1, 1, 1])) + assert array1 != array2 + assert array1 != array3 + assert array1 == array4 diff --git a/tests/data/fields/list_field_test.py b/tests/data/fields/list_field_test.py index cdf2ad97d87..2356d9b3646 100644 --- a/tests/data/fields/list_field_test.py +++ b/tests/data/fields/list_field_test.py @@ -7,7 +7,8 @@ from allennlp.data import Token, Vocabulary, Instance 
from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer -from allennlp.data.data_loaders import SimpleDataLoader +from allennlp.data.dataloader import PyTorchDataLoader +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset from allennlp.data.tokenizers import SpacyTokenizer from allennlp.models import Model from allennlp.modules import Embedding @@ -296,10 +297,11 @@ def test_empty_list_can_be_tensorized(self): instance.as_tensor_dict() def test_batch_with_some_empty_lists_works(self): - instances = [self.empty_instance, self.non_empty_instance] + dataset = AllennlpDataset([self.empty_instance, self.non_empty_instance], self.vocab) + model = DummyModel(self.vocab) model.eval() - loader = SimpleDataLoader(instances, 2, vocab=self.vocab) + loader = PyTorchDataLoader(dataset, batch_size=2) batch = next(iter(loader)) model.forward(**batch) @@ -310,10 +312,11 @@ def test_batch_with_some_empty_lists_works(self): # makes a whole lot more sense to just have a minimally-sized tensor that # gets entirely masked and has no effect on the rest of the model. def test_batch_of_entirely_empty_lists_works(self): - instances = [self.empty_instance, self.empty_instance] + dataset = AllennlpDataset([self.empty_instance, self.empty_instance], self.vocab) + model = DummyModel(self.vocab) model.eval() - loader = SimpleDataLoader(instances, 2, vocab=self.vocab) + loader = PyTorchDataLoader(dataset, batch_size=2) batch = next(iter(loader)) model.forward(**batch) diff --git a/tests/data/samplers/bucket_batch_sampler_test.py b/tests/data/samplers/bucket_batch_sampler_test.py index 3a972facdc2..dc71aa2efaa 100644 --- a/tests/data/samplers/bucket_batch_sampler_test.py +++ b/tests/data/samplers/bucket_batch_sampler_test.py @@ -1,18 +1,21 @@ from allennlp.common import Params -from allennlp.data import Instance, Token, Batch +from allennlp.data import Instance, Token +from allennlp.data.batch import Batch from allennlp.data.fields import TextField from allennlp.data.samplers import BucketBatchSampler -from allennlp.data.data_loaders import MultiProcessDataLoader +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset +from allennlp.data.dataloader import PyTorchDataLoader from .sampler_test import SamplerTest class TestBucketSampler(SamplerTest): def test_create_batches_groups_correctly(self): - sampler = BucketBatchSampler(batch_size=2, padding_noise=0, sorting_keys=["text"]) + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"]) grouped_instances = [] - for indices in sampler.get_batch_indices(self.instances): + for indices in sampler: grouped_instances.append([self.instances[idx] for idx in indices]) expected_groups = [ [self.instances[4], self.instances[2]], @@ -25,7 +28,8 @@ def test_create_batches_groups_correctly(self): assert expected_groups == [] def test_guess_sorting_key_picks_the_longest_key(self): - sampler = BucketBatchSampler(batch_size=2, padding_noise=0) + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0) instances = [] short_tokens = [Token(t) for t in ["what", "is", "this", "?"]] long_tokens = [Token(t) for t in ["this", "is", "a", "not", "very", "long", "passage"]] @@ -58,12 +62,13 @@ def test_guess_sorting_key_picks_the_longest_key(self): assert 
sampler.sorting_keys == ["passage"] def test_from_params(self): + dataset = AllennlpDataset(self.instances, self.vocab) params = Params({}) sorting_keys = ["s1", "s2"] params["sorting_keys"] = sorting_keys params["batch_size"] = 32 - sampler = BucketBatchSampler.from_params(params=params) + sampler = BucketBatchSampler.from_params(params=params, data_source=dataset) assert sampler.sorting_keys == sorting_keys assert sampler.padding_noise == 0.1 @@ -78,33 +83,27 @@ def test_from_params(self): } ) - sampler = BucketBatchSampler.from_params(params=params) + sampler = BucketBatchSampler.from_params(params=params, data_source=dataset) assert sampler.sorting_keys == sorting_keys assert sampler.padding_noise == 0.5 assert sampler.batch_size == 100 assert sampler.drop_last def test_drop_last_works(self): + dataset = AllennlpDataset(self.instances, vocab=self.vocab) sampler = BucketBatchSampler( + dataset, batch_size=2, padding_noise=0, sorting_keys=["text"], drop_last=True, ) - # We use a custom collate_fn for testing, which doesn't actually create tensors, # just the allennlp Batches. - def collate_fn(x, **kwargs): - return Batch(x) - - data_loader = MultiProcessDataLoader( - self.get_mock_reader(), - "fake_path", - batch_sampler=sampler, + dataloader = PyTorchDataLoader( + dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) ) - data_loader.collate_fn = collate_fn - data_loader.index_with(self.vocab) - batches = [batch for batch in iter(data_loader)] + batches = [batch for batch in iter(dataloader)] stats = self.get_batches_stats(batches) # all batches have length batch_size @@ -114,21 +113,29 @@ def collate_fn(x, **kwargs): assert stats["total_instances"] == len(self.instances) - 1 def test_batch_count(self): - sampler = BucketBatchSampler(batch_size=2, padding_noise=0, sorting_keys=["text"]) - data_loader = MultiProcessDataLoader( - self.get_mock_reader(), "fake_path", batch_sampler=sampler + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"]) + # We use a custom collate_fn for testing, which doesn't actually create tensors, + # just the allennlp Batches. + dataloader = PyTorchDataLoader( + dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) ) - data_loader.index_with(self.vocab) - assert len(data_loader) == 3 + + assert len(dataloader) == 3 def test_batch_count_with_drop_last(self): + dataset = AllennlpDataset(self.instances, vocab=self.vocab) sampler = BucketBatchSampler( + dataset, batch_size=2, padding_noise=0, sorting_keys=["text"], drop_last=True, ) - data_loader = MultiProcessDataLoader( - self.get_mock_reader(), "fake_path", batch_sampler=sampler + # We use a custom collate_fn for testing, which doesn't actually create tensors, + # just the allennlp Batches. 
+ dataloader = PyTorchDataLoader( + dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) ) - assert len(data_loader) == 2 + + assert len(dataloader) == 2 diff --git a/tests/data/samplers/max_tokens_batch_sampler_test.py b/tests/data/samplers/max_tokens_batch_sampler_test.py index a3b7e094733..04e5c87ca6c 100644 --- a/tests/data/samplers/max_tokens_batch_sampler_test.py +++ b/tests/data/samplers/max_tokens_batch_sampler_test.py @@ -1,17 +1,23 @@ +from allennlp.common import Params from allennlp.data import Instance, Token +from allennlp.data.batch import Batch from allennlp.data.fields import TextField from allennlp.data.samplers import MaxTokensBatchSampler -from allennlp.data.data_loaders import MultiProcessDataLoader +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset +from allennlp.data.dataloader import PyTorchDataLoader from .sampler_test import SamplerTest class TestMaxTokensSampler(SamplerTest): def test_create_batches_groups_correctly(self): - sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0, sorting_keys=["text"]) + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = MaxTokensBatchSampler( + dataset, max_tokens=8, padding_noise=0, sorting_keys=["text"] + ) grouped_instances = [] - for indices in sampler.get_batch_indices(self.instances): + for indices in sampler: grouped_instances.append([self.instances[idx] for idx in indices]) expected_groups = [ [self.instances[4], self.instances[2]], @@ -24,7 +30,8 @@ def test_create_batches_groups_correctly(self): assert expected_groups == [] def test_guess_sorting_key_picks_the_longest_key(self): - sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0) + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = MaxTokensBatchSampler(dataset, max_tokens=8, padding_noise=0) instances = [] short_tokens = [Token(t) for t in ["what", "is", "this", "?"]] long_tokens = [Token(t) for t in ["this", "is", "a", "not", "very", "long", "passage"]] @@ -56,9 +63,35 @@ def test_guess_sorting_key_picks_the_longest_key(self): sampler._guess_sorting_keys(instances) assert sampler.sorting_keys == ["passage"] + def test_from_params(self): + dataset = AllennlpDataset(self.instances, self.vocab) + params = Params({}) + + sorting_keys = ["s1", "s2"] + params["sorting_keys"] = sorting_keys + params["max_tokens"] = 32 + sampler = MaxTokensBatchSampler.from_params(params=params, data_source=dataset) + + assert sampler.sorting_keys == sorting_keys + assert sampler.padding_noise == 0.1 + assert sampler.max_tokens == 32 + + params = Params({"sorting_keys": sorting_keys, "padding_noise": 0.5, "max_tokens": 100}) + + sampler = MaxTokensBatchSampler.from_params(params=params, data_source=dataset) + assert sampler.sorting_keys == sorting_keys + assert sampler.padding_noise == 0.5 + assert sampler.max_tokens == 100 + def test_batch_count(self): - sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0, sorting_keys=["text"]) - data_loader = MultiProcessDataLoader( - self.get_mock_reader(), "fake_path", batch_sampler=sampler + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = MaxTokensBatchSampler( + dataset, max_tokens=8, padding_noise=0, sorting_keys=["text"] ) - assert len(data_loader) == 3 + # We use a custom collate_fn for testing, which doesn't actually create tensors, + # just the allennlp Batches. 
+ dataloader = PyTorchDataLoader( + dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) + ) + + assert len(dataloader) == 3 diff --git a/tests/data/samplers/sampler_test.py b/tests/data/samplers/sampler_test.py index 3be895f8657..e981d41ebec 100644 --- a/tests/data/samplers/sampler_test.py +++ b/tests/data/samplers/sampler_test.py @@ -1,7 +1,7 @@ from typing import List, Iterable, Dict, Union from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Vocabulary, Instance, Token, Batch, DatasetReader +from allennlp.data import Vocabulary, Instance, Token, Batch from allennlp.data.fields import TextField from allennlp.data.token_indexers import SingleIdTokenIndexer @@ -40,22 +40,9 @@ def setup_method(self): self.instances = instances self.lazy_instances = LazyIterable(instances) - def get_mock_reader(self) -> DatasetReader: - class MockReader(DatasetReader): - def __init__(self, instances, **kwargs): - super().__init__(**kwargs) - self.instances = instances - - def _read(self, file_path: str): - for instance in self.instances: - yield instance - - return MockReader(self.instances) - def create_instance(self, str_tokens: List[str]): tokens = [Token(t) for t in str_tokens] instance = Instance({"text": TextField(tokens, self.token_indexers)}) - instance.index_fields(self.vocab) return instance def create_instances_from_token_counts(self, token_counts: List[int]) -> List[Instance]: diff --git a/tests/data/token_indexers/pretrained_transformer_indexer_test.py b/tests/data/token_indexers/pretrained_transformer_indexer_test.py index d817af9b392..f15f6096a36 100644 --- a/tests/data/token_indexers/pretrained_transformer_indexer_test.py +++ b/tests/data/token_indexers/pretrained_transformer_indexer_test.py @@ -99,7 +99,7 @@ def test_transformers_vocab_sizes(self, model_name): def test_transformers_vocabs_added_correctly(self): namespace, model_name = "tags", "roberta-base" - tokenizer = cached_transformers.get_tokenizer(model_name, use_fast=False) + tokenizer = cached_transformers.get_tokenizer(model_name) allennlp_tokenizer = PretrainedTransformerTokenizer(model_name) indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace) allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!") diff --git a/tests/data/tokenizers/letters_digits_tokenizer_test.py b/tests/data/tokenizers/letters_digits_tokenizer_test.py index 10309a8355d..07673c07b35 100644 --- a/tests/data/tokenizers/letters_digits_tokenizer_test.py +++ b/tests/data/tokenizers/letters_digits_tokenizer_test.py @@ -1,5 +1,6 @@ from allennlp.common.testing import AllenNlpTestCase -from allennlp.data.tokenizers import Token, LettersDigitsTokenizer +from allennlp.data.tokenizers.letters_digits_tokenizer import LettersDigitsTokenizer +from allennlp.data.tokenizers.token import Token class TestLettersDigitsTokenizer(AllenNlpTestCase): diff --git a/tests/data/tokenizers/spacy_tokenizer_test.py b/tests/data/tokenizers/spacy_tokenizer_test.py index 87756ce2d4b..5f445453b69 100644 --- a/tests/data/tokenizers/spacy_tokenizer_test.py +++ b/tests/data/tokenizers/spacy_tokenizer_test.py @@ -1,7 +1,8 @@ import spacy from allennlp.common.testing import AllenNlpTestCase -from allennlp.data.tokenizers import Token, SpacyTokenizer +from allennlp.data.tokenizers.token import Token +from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer class TestSpacyTokenizer(AllenNlpTestCase): diff --git a/tests/data/vocabulary_test.py b/tests/data/vocabulary_test.py index 69b90c718f1..91d9c0e3021 100644 
--- a/tests/data/vocabulary_test.py +++ b/tests/data/vocabulary_test.py @@ -873,19 +873,3 @@ def test_from_files_with_model_archive(self): vocab = Vocabulary.from_files(str(self.model_archive)) vocab.get_namespaces() == {"tokens", "labels"} assert vocab.get_token_from_index(3, namespace="tokens") == "u.n." - - -class TestVocabularyFromPretrainedTransformer(AllenNlpTestCase): - @pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"]) - def test_from_pretrained_transformer(self, model_name): - namespace = "tokens" - from allennlp.common import cached_transformers - - tokenizer = cached_transformers.get_tokenizer(model_name) - - vocab = Vocabulary.from_pretrained_transformer(model_name, namespace=namespace) - assert vocab._token_to_index[namespace] == tokenizer.get_vocab() - vocab.save_to_files(self.TEST_DIR / "vocab") - - vocab1 = Vocabulary.from_files(self.TEST_DIR / "vocab") - assert vocab1._token_to_index[namespace] == tokenizer.get_vocab() diff --git a/tests/models/archival_test.py b/tests/models/archival_test.py index 9e7ed2fee31..3b135536622 100644 --- a/tests/models/archival_test.py +++ b/tests/models/archival_test.py @@ -43,7 +43,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"), @@ -90,7 +95,8 @@ def test_archive_model_uses_archive_path(self): train_model(self.params, serialization_dir=serialization_dir) # Use a new path. archive_model( - serialization_dir=serialization_dir, archive_path=serialization_dir / "new_path.tar.gz" + serialization_dir=serialization_dir, + archive_path=serialization_dir / "new_path.tar.gz", ) archive = load_archive(serialization_dir / "new_path.tar.gz") assert archive diff --git a/tests/modules/attention/scaled_dot_product_attention_test.py b/tests/modules/attention/scaled_dot_product_attention_test.py index 247cafc200d..6dd9314691b 100644 --- a/tests/modules/attention/scaled_dot_product_attention_test.py +++ b/tests/modules/attention/scaled_dot_product_attention_test.py @@ -5,7 +5,9 @@ from allennlp.common import Params from allennlp.common.testing.test_case import AllenNlpTestCase from allennlp.modules.attention.attention import Attention -from allennlp.modules.attention.scaled_dot_product_attention import ScaledDotProductAttention +from allennlp.modules.attention.scaled_dot_product_attention import ( + ScaledDotProductAttention, +) class TestScaledDotProductAttention(AllenNlpTestCase): diff --git a/tests/modules/elmo_test.py b/tests/modules/elmo_test.py index f885f6f39ec..77dfa797fa2 100644 --- a/tests/modules/elmo_test.py +++ b/tests/modules/elmo_test.py @@ -12,7 +12,8 @@ from allennlp.data.fields import TextField from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer -from allennlp.data.data_loaders import SimpleDataLoader +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset +from allennlp.data.dataloader import PyTorchDataLoader from allennlp.modules.elmo import _ElmoBiLm, _ElmoCharacterEncoder, Elmo from allennlp.modules.token_embedders import ElmoTokenEmbedder from allennlp.nn.util import remove_sentence_boundaries @@ 
-99,9 +100,9 @@ def test_elmo_bilm(self): instances.append(instance) vocab = Vocabulary() + dataset = AllennlpDataset(instances, vocab) # Now finally we can iterate through batches. - loader = SimpleDataLoader(instances, 3) - loader.index_with(vocab) + loader = PyTorchDataLoader(dataset, 3) for i, batch in enumerate(loader): lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"]) top_layer_embeddings, mask = remove_sentence_boundaries( diff --git a/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py b/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py index 4dcb3db98d8..6a6eabe48e6 100644 --- a/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py +++ b/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py @@ -32,7 +32,7 @@ def test_positional_embeddings(positional_encoding: Optional[str]): @pytest.mark.parametrize("positional_encoding", [None, "sinusoidal", "embedding"]) -def test_positional_encodings(positional_encoding: Optional[str]): +def test_mask_works(positional_encoding: Optional[str]): # All sizes are prime, making them easy to find during debugging. batch_size = 3 max_seq_len = 11 @@ -44,35 +44,28 @@ def test_positional_encodings(positional_encoding: Optional[str]): transformer.eval() with torch.no_grad(): - # We test this by running it twice, once with a shuffled sequence. The results should be the same if there - # is no positional encoding, and different otherwise. + # Construct inputs and masks inputs = torch.randn(batch_size, max_seq_len, dims) - mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) + all_ones_mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) + mask = all_ones_mask.clone() for b in range(batch_size): mask[b, max_seq_len - b :] = False - unshuffled_output = transformer(inputs, mask) + altered_inputs = inputs + (~mask).unsqueeze(2) * 10.0 - shuffle = torch.arange(0, max_seq_len).unsqueeze(0).expand_as(mask).clone() - for b in range(batch_size): - # Take care not to shuffle the masked values - perm = torch.randperm(max_seq_len - b) - shuffle[b, : max_seq_len - b] = shuffle[b, perm] - shuffle = shuffle.unsqueeze(2).expand_as(inputs) - shuffled_input = torch.gather(inputs, 1, shuffle) - shuffled_output = transformer(shuffled_input, mask) + # Make sure there is a difference without the mask + assert not torch.allclose( + transformer(inputs, all_ones_mask), transformer(altered_inputs, all_ones_mask) + ) - if positional_encoding is None: - assert torch.allclose( - torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-5 - ) - else: - assert not torch.allclose( - torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-5 - ) + # Make sure there is no difference with the mask + assert torch.allclose( + torch.masked_select(transformer(inputs, mask), mask.unsqueeze(2)), + torch.masked_select(transformer(altered_inputs, mask), mask.unsqueeze(2)), + ) @pytest.mark.parametrize("positional_encoding", [None, "sinusoidal", "embedding"]) -def test_mask_works(positional_encoding: Optional[str]): +def test_positional_encodings(positional_encoding: Optional[str]): # All sizes are prime, making them easy to find during debugging. batch_size = 3 max_seq_len = 11 @@ -84,21 +77,28 @@ def test_mask_works(positional_encoding: Optional[str]): transformer.eval() with torch.no_grad(): - # Construct inputs and masks + # We test this by running it twice, once with a shuffled sequence. 
The results should be the same if there + # is no positional encoding, and different otherwise. inputs = torch.randn(batch_size, max_seq_len, dims) - all_ones_mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) - mask = all_ones_mask.clone() + mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) for b in range(batch_size): mask[b, max_seq_len - b :] = False - altered_inputs = inputs + (~mask).unsqueeze(2) * 10.0 + unshuffled_output = transformer(inputs, mask) - # Make sure there is a difference without the mask - assert not torch.allclose( - transformer(inputs, all_ones_mask), transformer(altered_inputs, all_ones_mask) - ) + shuffle = torch.arange(0, max_seq_len).unsqueeze(0).expand_as(mask).clone() + for b in range(batch_size): + # Take care not to shuffle the masked values + perm = torch.randperm(max_seq_len - b) + shuffle[b, : max_seq_len - b] = shuffle[b, perm] + shuffle = shuffle.unsqueeze(2).expand_as(inputs) + shuffled_input = torch.gather(inputs, 1, shuffle) + shuffled_output = transformer(shuffled_input, mask) - # Make sure there is no difference with the mask - assert torch.allclose( - torch.masked_select(transformer(inputs, mask), mask.unsqueeze(2)), - torch.masked_select(transformer(altered_inputs, mask), mask.unsqueeze(2)), - ) + if positional_encoding is None: + assert torch.allclose( + torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-7 + ) + else: + assert not torch.allclose( + torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-7 + ) diff --git a/tests/modules/token_embedders/pretrained_transformer_embedder_test.py b/tests/modules/token_embedders/pretrained_transformer_embedder_test.py index 0b454c84db5..72233c944ee 100644 --- a/tests/modules/token_embedders/pretrained_transformer_embedder_test.py +++ b/tests/modules/token_embedders/pretrained_transformer_embedder_test.py @@ -315,18 +315,3 @@ def test_encoder_decoder_model(self): token_ids = torch.LongTensor([[1, 2, 3], [2, 3, 4]]) mask = torch.ones_like(token_ids).bool() token_embedder(token_ids, mask) - - def test_embeddings_resize(self): - regular_token_embedder = PretrainedTransformerEmbedder("bert-base-cased") - assert ( - regular_token_embedder.transformer_model.embeddings.word_embeddings.num_embeddings - == 28996 - ) - tokenizer_kwargs = {"additional_special_tokens": [""]} - enhanced_token_embedder = PretrainedTransformerEmbedder( - "bert-base-cased", tokenizer_kwargs=tokenizer_kwargs - ) - assert ( - enhanced_token_embedder.transformer_model.embeddings.word_embeddings.num_embeddings - == 28997 - ) diff --git a/tests/modules/transformer/toolkit_test.py b/tests/modules/transformer/toolkit_test.py index cd1bf60e9fd..df995d1f076 100644 --- a/tests/modules/transformer/toolkit_test.py +++ b/tests/modules/transformer/toolkit_test.py @@ -112,16 +112,24 @@ def forward( medium_layers = dict(medium.combined_transformer.layers.named_modules()) assert_equal_parameters( - medium_layers["0"], pretrained_layers["8"], TransformerStack._huggingface_mapping + medium_layers["0"], + pretrained_layers["8"], + TransformerStack._huggingface_mapping, ) assert_equal_parameters( - medium_layers["1"], pretrained_layers["9"], TransformerStack._huggingface_mapping + medium_layers["1"], + pretrained_layers["9"], + TransformerStack._huggingface_mapping, ) assert_equal_parameters( - medium_layers["2"], pretrained_layers["10"], TransformerStack._huggingface_mapping + medium_layers["2"], + pretrained_layers["10"], + TransformerStack._huggingface_mapping, ) assert_equal_parameters( - 
medium_layers["3"], pretrained_layers["11"], TransformerStack._huggingface_mapping + medium_layers["3"], + pretrained_layers["11"], + TransformerStack._huggingface_mapping, ) def test_combination_of_two_different_berts(self): diff --git a/tests/modules/transformer/transformer_embeddings_test.py b/tests/modules/transformer/transformer_embeddings_test.py index 08212ee15c9..9e267d3a9cf 100644 --- a/tests/modules/transformer/transformer_embeddings_test.py +++ b/tests/modules/transformer/transformer_embeddings_test.py @@ -124,7 +124,11 @@ def __init__( self.dropout = torch.nn.Dropout(dropout) def forward( - self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, ): if input_ids is not None: input_shape = input_ids.size() @@ -168,7 +172,9 @@ def test_forward_runs_with_inputs(self): token_type_ids = torch.tensor([[1, 0]], dtype=torch.long) position_ids = torch.tensor([[0, 1]]) self.transformer_embeddings.forward( - input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, ) def test_output_size(self): @@ -180,7 +186,9 @@ def test_output_size(self): params = Params(params) module = TransformerEmbeddings.from_params(params) output = module.forward( - input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, ) assert output.shape[-1] == 7 @@ -224,13 +232,17 @@ def test_forward_against_huggingface_output(self, module_name, hf_module): torch.manual_seed(1234) embeddings = embeddings.eval() # setting to eval mode to avoid non-deterministic dropout. output = embeddings.forward( - input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, ) torch.manual_seed(1234) hf_module = hf_module.eval() # setting to eval mode to avoid non-deterministic dropout. 
hf_output = hf_module.forward( - input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, ) assert torch.allclose(output, hf_output) diff --git a/tests/nn/util_test.py b/tests/nn/util_test.py index d98439534ff..705f3f7ab74 100644 --- a/tests/nn/util_test.py +++ b/tests/nn/util_test.py @@ -1427,6 +1427,25 @@ def test_combine_tensors_and_multiply_with_batch_size_one_and_seq_len_one(self): assert_almost_equal(result.size(), [1, seq_len_1, seq_len_2]) + def test_has_tensor(self): + + has_tensor = util.has_tensor + tensor = torch.tensor([1, 2, 3]) + + assert has_tensor(["a", 10, tensor]) + assert not has_tensor(["a", 10]) + + assert has_tensor(("a", 10, tensor)) + assert not has_tensor(("a", 10)) + + assert has_tensor({"a": tensor, "b": 1}) + assert not has_tensor({"a": 10, "b": 1}) + + assert has_tensor(tensor) + assert not has_tensor(3) + + assert has_tensor({"x": [0, {"inside": {"double_inside": [3, [10, tensor]]}}]}) + def test_combine_initial_dims(self): tensor = torch.randn(4, 10, 20, 17, 5) @@ -1452,13 +1471,13 @@ def test_inspect_model_parameters(self): assert parameters_inspection_dict == util.inspect_parameters(model) def test_move_to_device(self): - # We're faking the tensor here so that we can test the calls to .to() without actually + # We're faking the tensor here so that we can test the calls to .cuda() without actually # needing a GPU. class FakeTensor(torch.Tensor): def __init__(self): self._device = None - def to(self, device, **kwargs): + def cuda(self, device): self._device = device return self diff --git a/tests/training/learning_rate_schedulers/slanted_triangular_test.py b/tests/training/learning_rate_schedulers/slanted_triangular_test.py index 5280970a34a..fadd7582186 100644 --- a/tests/training/learning_rate_schedulers/slanted_triangular_test.py +++ b/tests/training/learning_rate_schedulers/slanted_triangular_test.py @@ -5,10 +5,11 @@ import torch import pytest +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset from allennlp.common import Lazy, Params from allennlp.common.checks import ConfigurationError from allennlp.common.testing import AllenNlpTestCase -from allennlp.data.data_loaders import SimpleDataLoader +from allennlp.data import PyTorchDataLoader from allennlp.training import Trainer from allennlp.training.learning_rate_schedulers import LearningRateScheduler, SlantedTriangular from allennlp.training.optimizers import Optimizer @@ -113,14 +114,14 @@ def test_from_params_in_trainer(self): ) # The method called in the logic below only checks the length of this list, not its # contents, so this should be safe. 
- instances = [1] * 40 + instances = AllennlpDataset([1] * 40) optim = self._get_optimizer() trainer = Trainer.from_params( model=self.model, optimizer=Lazy(lambda **kwargs: optim), serialization_dir=self.TEST_DIR, params=params, - data_loader=SimpleDataLoader(instances, batch_size=10), + data_loader=PyTorchDataLoader(instances, batch_size=10), ) assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular) @@ -150,7 +151,7 @@ def test_from_params_in_trainer(self): optimizer=Lazy(lambda **kwargs: optim), serialization_dir=self.TEST_DIR, params=params, - data_loader=SimpleDataLoader(instances, batch_size=10), + data_loader=PyTorchDataLoader(instances, batch_size=10), ) assert trainer._learning_rate_scheduler.num_epochs == 3 diff --git a/tests/training/optimizer_test.py b/tests/training/optimizer_test.py index b396cdcd4cc..1c330d5d718 100644 --- a/tests/training/optimizer_test.py +++ b/tests/training/optimizer_test.py @@ -20,7 +20,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, } ) self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params) @@ -90,10 +95,19 @@ def setup_method(self): { "text_field_embedder": { "token_embedders": { - "tokens": {"type": "embedding", "embedding_dim": 5, "sparse": True} + "tokens": { + "type": "embedding", + "embedding_dim": 5, + "sparse": True, + } } }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, } ) self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params) diff --git a/tests/training/trainer_test.py b/tests/training/trainer_test.py index 443c3e852e0..be37ebc0e5c 100644 --- a/tests/training/trainer_test.py +++ b/tests/training/trainer_test.py @@ -16,7 +16,11 @@ from allennlp.common.params import Params from allennlp.common.testing import AllenNlpTestCase, requires_gpu, requires_multi_gpu from allennlp.data import Vocabulary -from allennlp.data.data_loaders import MultiProcessDataLoader, SimpleDataLoader, TensorDict +from allennlp.data.data_loaders import ( + MultiProcessDataLoader, + SimpleDataLoader, + TensorDict, +) from allennlp.data.dataset_readers import SequenceTaggingDatasetReader from allennlp.models.model import Model from allennlp.models.simple_tagger import SimpleTagger @@ -52,7 +56,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, } ) self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params) @@ -559,7 +568,9 @@ def test_trainer_can_run_with_lr_scheduler(self): trainer.train() def test_trainer_sends_metric_to_lr_scheduler(self): - from allennlp.training.learning_rate_schedulers import ReduceOnPlateauLearningRateScheduler + from allennlp.training.learning_rate_schedulers import ( + ReduceOnPlateauLearningRateScheduler, + ) class RecordMetricLearningRateScheduler(ReduceOnPlateauLearningRateScheduler): def __init__(self, optimizer: Optimizer): @@ -960,7 +971,10 @@ def test_track_epoch_callback(self): def 
test_trainer_callback_is_called_everywhere(self): class FakeTrainerCallback(TrainerCallback): def on_start( - self, trainer: "GradientDescentTrainer", is_primary: bool = True, **kwargs + self, + trainer: "GradientDescentTrainer", + is_primary: bool = True, + **kwargs, ) -> None: if not hasattr(trainer, "start_callback_is_fired_first"): trainer.start_callback_is_fired_first = True # type: ignore @@ -1110,4 +1124,4 @@ def test_sparse_clip_grad(self): _ = clip_grad_norm_([embedding.weight], 1.5) # Final norm should be 1.5 grad = embedding.weight.grad.coalesce() - assert grad._values().norm(2.0).item() == pytest.approx(1.5, rel=1e-4) \ No newline at end of file + assert grad._values().norm(2.0).item() == pytest.approx(1.5, rel=1e-4) From e7b88252aa7344584635714301402b98b162279a Mon Sep 17 00:00:00 2001 From: Jacob Danovitch Date: Thu, 18 Feb 2021 14:04:17 -0500 Subject: [PATCH 20/20] checking in sparse attention --- allennlp/training/deepspeed/__init__.py | 1 + .../training/deepspeed/modules/__init__.py | 1 + .../deepspeed/modules/sparse_attention.py | 77 +++++++++++++++++++ .../deepspeed/modules/sparse_transformer.py | 52 +++++++++++++ setup.py | 2 +- 5 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 allennlp/training/deepspeed/modules/__init__.py create mode 100644 allennlp/training/deepspeed/modules/sparse_attention.py create mode 100644 allennlp/training/deepspeed/modules/sparse_transformer.py diff --git a/allennlp/training/deepspeed/__init__.py b/allennlp/training/deepspeed/__init__.py index e5f1e5c47e8..14fcb03b22e 100644 --- a/allennlp/training/deepspeed/__init__.py +++ b/allennlp/training/deepspeed/__init__.py @@ -1,2 +1,3 @@ from allennlp.training.deepspeed.trainer import DeepspeedTrainer from allennlp.training.deepspeed import optimizers +from allennlp.training.deepspeed import modules diff --git a/allennlp/training/deepspeed/modules/__init__.py b/allennlp/training/deepspeed/modules/__init__.py new file mode 100644 index 00000000000..ccb4f78dac4 --- /dev/null +++ b/allennlp/training/deepspeed/modules/__init__.py @@ -0,0 +1 @@ +from allennlp.training.deepspeed.modules.sparse_transformer import SparseTransformerEmbedder diff --git a/allennlp/training/deepspeed/modules/sparse_attention.py b/allennlp/training/deepspeed/modules/sparse_attention.py new file mode 100644 index 00000000000..184892fafa4 --- /dev/null +++ b/allennlp/training/deepspeed/modules/sparse_attention.py @@ -0,0 +1,77 @@ +from typing import Optional, Union +from overrides import overrides +from copy import deepcopy + +from allennlp.common import Registrable + +from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.bert.modeling_bert import BertLayer + +from transformers.models.roberta.configuration_roberta import RobertaConfig +from transformers.models.roberta.modeling_roberta import RobertaLayer + +from deepspeed.ops.sparse_attention import ( + BertSparseSelfAttention, + SparsityConfig, + DenseSparsityConfig, + FixedSparsityConfig, + VariableSparsityConfig, + BigBirdSparsityConfig, + BSLongformerSparsityConfig, +) + +import torch +import warnings + + +class SparseSelfAttentionLayer(BertSparseSelfAttention): + @overrides + def forward( + self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor], *args, **kwargs + ): + extras = (*args, *kwargs.values()) + if not all(arg is None for arg in extras): + warnings.warn("SparseSelfAttentionLayer only accepts hidden_states and attention_mask.") + + return (super().forward(hidden_states, attention_mask),) + 
+ +def replace_self_attention( + model: torch.nn.Module, + sparsity_config: SparsityConfig, + model_config: Union[BertConfig, RobertaConfig] = None, +): + # Largely follows these: + # https://github.com/microsoft/DeepSpeed/blob/c5b3f40e8481748f9658a19c2df1f17c5b579919/deepspeed/module_inject/inject.py#L6 + # https://github.com/microsoft/DeepSpeed/blob/c5b3f40e8481748f9658a19c2df1f17c5b579919/deepspeed/ops/sparse_attention/sparse_attention_utils.py#L85 + + config = model_config or model.config + assert isinstance( + config, (BertConfig, RobertaConfig) + ), "Only BERT and RoBERTa are currently supported by Deepspeed." + + for name, layer in model.named_children(): + if isinstance(layer, (BertLayer, RobertaLayer)): + deepspeed_sparse_self_attn = SparseSelfAttentionLayer(config, sparsity_config) + deepspeed_sparse_self_attn.query = layer.attention.self.query + deepspeed_sparse_self_attn.key = layer.attention.self.key + deepspeed_sparse_self_attn.value = layer.attention.self.value + + layer.attention.self = deepspeed_sparse_self_attn + setattr(model, name, deepcopy(layer)) + else: + replace_self_attention(layer, sparsity_config, model_config=config) + + return model + + +class _SparsityConfig(Registrable, SparsityConfig): + default_implementation = "base" + + +_SparsityConfig.register("base")(SparsityConfig) +_SparsityConfig.register("dense")(DenseSparsityConfig) +_SparsityConfig.register("fixed")(FixedSparsityConfig) +_SparsityConfig.register("variable")(VariableSparsityConfig) +_SparsityConfig.register("bigbird")(BigBirdSparsityConfig) +_SparsityConfig.register("longformer")(BSLongformerSparsityConfig) diff --git a/allennlp/training/deepspeed/modules/sparse_transformer.py b/allennlp/training/deepspeed/modules/sparse_transformer.py new file mode 100644 index 00000000000..4ed57ea3b47 --- /dev/null +++ b/allennlp/training/deepspeed/modules/sparse_transformer.py @@ -0,0 +1,52 @@ +from typing import Optional +from overrides import overrides + +from allennlp.modules.token_embedders.token_embedder import TokenEmbedder +from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder +from deepspeed.ops.sparse_attention import SparseAttentionUtils + +from .sparse_attention import _SparsityConfig, replace_self_attention + +import torch + + +@TokenEmbedder.register("sparse_transformer") +class SparseTransformerEmbedder(PretrainedTransformerEmbedder): + def __init__( + self, + model_name: str, + sparsity_config: _SparsityConfig = _SparsityConfig(num_heads=4), + **kwargs + ): + super().__init__(model_name, **kwargs) + + self._sparsity_config = sparsity_config + self.transformer_model = replace_self_attention( + self.transformer_model, self._sparsity_config + ) + + @overrides + def forward( + self, + token_ids: torch.LongTensor, + mask: torch.BoolTensor, + type_ids: Optional[torch.LongTensor] = None, + segment_concat_mask: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: # type: ignore + + _, token_ids, mask, type_ids, *_ = SparseAttentionUtils.pad_to_block_size( + block_size=self._sparsity_config.block, + input_ids=token_ids, + attention_mask=mask, + token_type_ids=type_ids, + position_ids=None, + inputs_embeds=None, + pad_token_id=self.transformer_model.config.pad_token_id, + model_mbeddings=None, # typo is in function definition, not here + ) + return super().forward( + token_ids=token_ids, + mask=mask, + type_ids=type_ids, + segment_concat_mask=segment_concat_mask, + ) diff --git a/setup.py b/setup.py index 1026c144a27..8823174e662 100644 --- a/setup.py 
+++ b/setup.py @@ -73,7 +73,7 @@ "lmdb", "more-itertools", ], - extras_require={"deepspeed": ["deepspeed>=0.3.7"]}, + extras_require={"deepspeed": ["deepspeed>=0.3.7,<=0.3.8"]}, entry_points={"console_scripts": ["allennlp=allennlp.__main__:run"]}, include_package_data=True, python_requires=">=3.6.1",
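
The `SparseTransformerEmbedder` added in the final commit above is registered as the `"sparse_transformer"` token embedder: it wraps `PretrainedTransformerEmbedder`, swaps each BERT/RoBERTa self-attention layer for DeepSpeed's block-sparse attention via `replace_self_attention`, and pads inputs to the sparsity block size before the wrapped forward pass. A minimal usage sketch follows; it is not part of the patch, it assumes a CUDA-capable environment with DeepSpeed's sparse-attention (Triton) kernels available, and the model name and `FixedSparsityConfig` settings are illustrative assumptions rather than values taken from the patch.

    import torch
    from deepspeed.ops.sparse_attention import FixedSparsityConfig

    from allennlp.training.deepspeed.modules import SparseTransformerEmbedder

    # Illustrative settings: set num_heads to match the wrapped model's attention
    # heads (12 for bert-base); block is the sparse block size inputs are padded to.
    sparsity_config = FixedSparsityConfig(num_heads=12, block=16)

    # replace_self_attention() swaps each BertLayer's self-attention for a
    # SparseSelfAttentionLayer that reuses the original query/key/value projections.
    embedder = SparseTransformerEmbedder("bert-base-uncased", sparsity_config=sparsity_config)

    token_ids = torch.randint(0, 1000, (2, 7))
    mask = torch.ones_like(token_ids).bool()

    # pad_to_block_size pads token_ids and mask up to a multiple of `block` before
    # calling the parent forward, so the output sequence length reflects that padding.
    embeddings = embedder(token_ids, mask)

Because `_SparsityConfig` is `Registrable` (with `"base"`, `"dense"`, `"fixed"`, `"variable"`, `"bigbird"`, and `"longformer"` registered), the same embedder should also be constructible from a training config by giving the token embedder `"type": "sparse_transformer"` and a typed `sparsity_config` entry, though that path is not exercised by the patch itself.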