diff --git a/CHANGELOG.md b/CHANGELOG.md index ccb68c84ec0..966eafd464c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added + +- Added `DeepspeedTrainer` and `FusedLambOptimizer` for training with Microsoft's DeepSpeed library, registered via the `allennlp.training.deepspeed` default plugin. + ### Fixed - Ensured that `MeanAbsoluteError` always returns a `float` metric value instead of a `Tensor`. diff --git a/allennlp/commands/__init__.py b/allennlp/commands/__init__.py index 3a0fba2232f..561f9137a19 100644 --- a/allennlp/commands/__init__.py +++ b/allennlp/commands/__init__.py @@ -50,7 +50,9 @@ def add_argument(self, *args, **kwargs): super().add_argument(*args, **kwargs) -def parse_args(prog: Optional[str] = None) -> Tuple[argparse.ArgumentParser, argparse.Namespace]: +def parse_args( + prog: Optional[str] = None, +) -> Tuple[argparse.ArgumentParser, argparse.Namespace]: """ Creates the argument parser for the main program and uses it to parse the args. """ diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index d0b0692f857..63e4d23838e 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -27,17 +27,23 @@ class Evaluate(Subcommand): def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.ArgumentParser: description = """Evaluate the specified model + dataset""" subparser = parser.add_parser( - self.name, description=description, help="Evaluate the specified model + dataset." + self.name, + description=description, + help="Evaluate the specified model + dataset.", ) subparser.add_argument("archive_file", type=str, help="path to an archived trained model") subparser.add_argument( - "input_file", type=str, help="path to the file containing the evaluation data" + "input_file", + type=str, + help="path to the file containing the evaluation data", ) subparser.add_argument( - "--output-file", type=str, help="optional path to write the metrics to as JSON" + "--output-file", + type=str, + help="optional path to write the metrics to as JSON", ) subparser.add_argument( @@ -47,7 +53,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "--weights-file", type=str, help="a path that overrides which weights file to use" + "--weights-file", + type=str, + help="a path that overrides which weights file to use", ) cuda_device = subparser.add_mutually_exclusive_group(required=False) @@ -68,7 +76,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "--batch-size", type=int, help="If non-empty, the batch size to use during evaluation."
+ "--batch-size", + type=int, + help="If non-empty, the batch size to use during evaluation.", ) subparser.add_argument( diff --git a/allennlp/commands/find_learning_rate.py b/allennlp/commands/find_learning_rate.py index 8a1f6380ed4..853a408070c 100644 --- a/allennlp/commands/find_learning_rate.py +++ b/allennlp/commands/find_learning_rate.py @@ -39,7 +39,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "param_path", type=str, help="path to parameter file describing the model to be trained" + "param_path", + type=str, + help="path to parameter file describing the model to be trained", ) subparser.add_argument( "-s", @@ -60,10 +62,16 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ), ) subparser.add_argument( - "--start-lr", type=float, default=1e-5, help="learning rate to start the search" + "--start-lr", + type=float, + default=1e-5, + help="learning rate to start the search", ) subparser.add_argument( - "--end-lr", type=float, default=10, help="learning rate up to which search is done" + "--end-lr", + type=float, + default=10, + help="learning rate up to which search is done", ) subparser.add_argument( "--num-batches", diff --git a/allennlp/commands/predict.py b/allennlp/commands/predict.py index 24ac891b9a4..8742828d7f2 100644 --- a/allennlp/commands/predict.py +++ b/allennlp/commands/predict.py @@ -28,7 +28,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument description = """Run the specified model against a JSON-lines input file.""" subparser = parser.add_parser( - self.name, description=description, help="Use a trained model to make predictions." + self.name, + description=description, + help="Use a trained model to make predictions.", ) subparser.add_argument( @@ -38,12 +40,17 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument subparser.add_argument("--output-file", type=str, help="path to output file") subparser.add_argument( - "--weights-file", type=str, help="a path that overrides which weights file to use" + "--weights-file", + type=str, + help="a path that overrides which weights file to use", ) batch_size = subparser.add_mutually_exclusive_group(required=False) batch_size.add_argument( - "--batch-size", type=int, default=1, help="The batch size to use for processing" + "--batch-size", + type=int, + default=1, + help="The batch size to use for processing", ) subparser.add_argument( @@ -86,7 +93,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "--predictor", type=str, help="optionally specify a specific predictor to use" + "--predictor", + type=str, + help="optionally specify a specific predictor to use", ) subparser.add_argument( diff --git a/allennlp/commands/subcommand.py b/allennlp/commands/subcommand.py index 3efdef3e71e..ee327b22f33 100644 --- a/allennlp/commands/subcommand.py +++ b/allennlp/commands/subcommand.py @@ -39,7 +39,10 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument @classmethod @overrides def register( - cls: Type[T], name: str, constructor: Optional[str] = None, exist_ok: bool = False + cls: Type[T], + name: str, + constructor: Optional[str] = None, + exist_ok: bool = False, ) -> Callable[[Type[T]], Type[T]]: super_register_fn = super().register(name, constructor=constructor, exist_ok=exist_ok) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 33d5df63acb..c32c792ed1b 
100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -24,7 +24,11 @@ from allennlp.common.plugins import import_plugins from allennlp.data import DatasetReader, Vocabulary from allennlp.data import DataLoader -from allennlp.models.archival import archive_model, CONFIG_NAME, verify_include_in_archive +from allennlp.models.archival import ( + archive_model, + CONFIG_NAME, + verify_include_in_archive, +) from allennlp.models.model import _DEFAULT_WEIGHTS, Model from allennlp.training.trainer import Trainer from allennlp.training import util as training_util @@ -40,7 +44,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument subparser = parser.add_parser(self.name, description=description, help="Train a model.") subparser.add_argument( - "param_path", type=str, help="path to parameter file describing the model to be trained" + "param_path", + type=str, + help="path to parameter file describing the model to be trained", ) subparser.add_argument( @@ -80,7 +86,10 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument ) subparser.add_argument( - "--node-rank", type=int, default=0, help="rank of this node in the distributed setup" + "--node-rank", + type=int, + default=0, + help="rank of this node in the distributed setup", ) subparser.add_argument( diff --git a/allennlp/common/cached_transformers.py b/allennlp/common/cached_transformers.py index e3e700af8a2..bbdf20ff9df 100644 --- a/allennlp/common/cached_transformers.py +++ b/allennlp/common/cached_transformers.py @@ -66,7 +66,9 @@ def strip_prefix(s): } if len(valid_keys) > 0: logger.info( - "Loading %d tensors from %s", len(valid_keys), override_weights_file + "Loading %d tensors from %s", + len(valid_keys), + override_weights_file, ) else: raise ValueError( diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py index b899a218718..82671fb901d 100644 --- a/allennlp/common/file_utils.py +++ b/allennlp/common/file_utils.py @@ -652,7 +652,10 @@ class CacheFile: """ def __init__( - self, cache_filename: Union[PathLike, str], mode: str = "w+b", suffix: str = ".tmp" + self, + cache_filename: Union[PathLike, str], + mode: str = "w+b", + suffix: str = ".tmp", ) -> None: self.cache_filename = ( cache_filename if isinstance(cache_filename, Path) else Path(cache_filename) @@ -671,7 +674,9 @@ def __exit__(self, exc_type, exc_value, traceback): if exc_value is None: # Success. logger.debug( - "Renaming temp file %s to cache at %s", self.temp_file.name, self.cache_filename + "Renaming temp file %s to cache at %s", + self.temp_file.name, + self.cache_filename, ) # Rename the temp file to the actual cache filename. os.replace(self.temp_file.name, self.cache_filename) @@ -922,7 +927,10 @@ def get_file_extension(path: str, dot=True, lower: bool = True): def open_compressed( - filename: Union[str, PathLike], mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs + filename: Union[str, PathLike], + mode: str = "rt", + encoding: Optional[str] = "UTF-8", + **kwargs, ): if not isinstance(filename, str): filename = str(filename) diff --git a/allennlp/common/from_params.py b/allennlp/common/from_params.py index 6db2de629b2..be2787518f7 100644 --- a/allennlp/common/from_params.py +++ b/allennlp/common/from_params.py @@ -86,7 +86,9 @@ def is_base_registrable(cls) -> bool: Checks whether this is a class that directly inherits from Registrable, or is a subclass of such a class. 
""" - from allennlp.common.registrable import Registrable # import here to avoid circular imports + from allennlp.common.registrable import ( + Registrable, + ) # import here to avoid circular imports if not issubclass(cls, Registrable): return False @@ -148,7 +150,10 @@ def infer_params( else: super_parameters = {} - return {**super_parameters, **parameters} # Subclass parameters overwrite superclass ones + return { + **super_parameters, + **parameters, + } # Subclass parameters overwrite superclass ones def create_kwargs( @@ -245,7 +250,12 @@ def create_extras(cls: Type[T], extras: Dict[str, Any]) -> Dict[str, Any]: def pop_and_construct_arg( - class_name: str, argument_name: str, annotation: Type, default: Any, params: Params, **extras + class_name: str, + argument_name: str, + annotation: Type, + default: Any, + params: Params, + **extras, ) -> Any: """ Does the work of actually constructing an individual argument for @@ -261,7 +271,9 @@ def pop_and_construct_arg( `inspect.Parameter` object directly, so that we can handle `Union` types using recursion on this method, trying the different annotation types in the union in turn. """ - from allennlp.models.archival import load_archive # import here to avoid circular imports + from allennlp.models.archival import ( + load_archive, + ) # import here to avoid circular imports # We used `argument_name` as the method argument to avoid conflicts with 'name' being a key in # `extras`, which isn't _that_ unlikely. Now that we are inside the method, we can switch back @@ -536,7 +548,9 @@ def from_params( constructor (because you inspect `__init__`, but call `cls()`). """ - from allennlp.common.registrable import Registrable # import here to avoid circular imports + from allennlp.common.registrable import ( + Registrable, + ) # import here to avoid circular imports logger.debug( f"instantiating class {cls} from params {getattr(params, 'params', params)} " diff --git a/allennlp/common/plugins.py b/allennlp/common/plugins.py index e114631f3ab..45f7596ad08 100644 --- a/allennlp/common/plugins.py +++ b/allennlp/common/plugins.py @@ -33,14 +33,21 @@ The global plugins file will be found here. """ -DEFAULT_PLUGINS = ("allennlp_models", "allennlp_semparse", "allennlp_server") +DEFAULT_PLUGINS = ( + "allennlp_models", + "allennlp_semparse", + "allennlp_server", + "allennlp.training.deepspeed", +) """ Default plugins do not need to be declared in a plugins file. They will always be imported when they are installed in the current Python environment. """ -def discover_file_plugins(plugins_filename: str = LOCAL_PLUGINS_FILENAME) -> Iterable[str]: +def discover_file_plugins( + plugins_filename: str = LOCAL_PLUGINS_FILENAME, +) -> Iterable[str]: """ Returns an iterable of the plugins found, declared within a file whose path is `plugins_filename`. 
""" diff --git a/allennlp/common/testing/model_test_case.py b/allennlp/common/testing/model_test_case.py index d920152b2fc..5594493b480 100644 --- a/allennlp/common/testing/model_test_case.py +++ b/allennlp/common/testing/model_test_case.py @@ -54,7 +54,9 @@ def set_up_model( self.vocab = vocab self.instances = instances self.model = Model.from_params( - vocab=self.vocab, params=params["model"], serialization_dir=serialization_dir + vocab=self.vocab, + params=params["model"], + serialization_dir=serialization_dir, ) # TODO(joelgrus) get rid of these @@ -149,13 +151,17 @@ def ensure_model_can_train_save_and_load( print("Reading with original model") data_loader = DataLoader.from_params( - params=data_loader_params, reader=reader, data_path=params["validation_data_path"] + params=data_loader_params, + reader=reader, + data_path=params["validation_data_path"], ) data_loader.index_with(model.vocab) print("Reading with loaded model") data_loader2 = DataLoader.from_params( - params=data_loader_params2, reader=reader, data_path=params["validation_data_path"] + params=data_loader_params2, + reader=reader, + data_path=params["validation_data_path"], ) data_loader2.index_with(loaded_model.vocab) @@ -193,7 +199,10 @@ def ensure_model_can_train_save_and_load( # Both outputs should have the same keys and the values for these keys should be close. for key in model_predictions.keys(): self.assert_fields_equal( - model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance + model_predictions[key], + loaded_model_predictions[key], + name=key, + tolerance=tolerance, ) # Check loaded model's loss exists and we can compute gradients, for continuing training. @@ -277,7 +286,10 @@ def assert_fields_equal(self, field1, field2, name: str, tolerance: float = 1e-6 assert field1.keys() == field2.keys() for key in field1: self.assert_fields_equal( - field1[key], field2[key], tolerance=tolerance, name=name + "." + str(key) + field1[key], + field2[key], + tolerance=tolerance, + name=name + "." + str(key), ) elif isinstance(field1, (list, tuple)): assert len(field1) == len(field2) diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 9f466e8ee6b..56291f70976 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -23,7 +23,8 @@ class AllenNlpTestCase: def setup_method(self): logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.DEBUG + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + level=logging.DEBUG, ) # Disabling some of the more verbose logging statements that typically aren't very helpful # in tests. 
diff --git a/allennlp/common/util.py b/allennlp/common/util.py index eddb7600ffd..de4ed6461c9 100644 --- a/allennlp/common/util.py +++ b/allennlp/common/util.py @@ -466,7 +466,10 @@ def int_to_device(device: Union[int, torch.device]) -> torch.device: def log_frozen_and_tunable_parameter_names(model: torch.nn.Module) -> None: - frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(model) + ( + frozen_parameter_names, + tunable_parameter_names, + ) = get_frozen_and_tunable_parameter_names(model) logger.info("The following parameters are Frozen (without gradient):") for name in frozen_parameter_names: diff --git a/allennlp/data/data_loaders/__init__.py b/allennlp/data/data_loaders/__init__.py index 8c2dfe8776c..ce94ed8dd69 100644 --- a/allennlp/data/data_loaders/__init__.py +++ b/allennlp/data/data_loaders/__init__.py @@ -1,4 +1,11 @@ -from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict, allennlp_collate -from allennlp.data.data_loaders.multiprocess_data_loader import MultiProcessDataLoader, WorkerError +from allennlp.data.data_loaders.data_loader import ( + DataLoader, + TensorDict, + allennlp_collate, +) +from allennlp.data.data_loaders.multiprocess_data_loader import ( + MultiProcessDataLoader, + WorkerError, +) from allennlp.data.data_loaders.multitask_data_loader import MultiTaskDataLoader from allennlp.data.data_loaders.simple_data_loader import SimpleDataLoader diff --git a/allennlp/data/data_loaders/multiprocess_data_loader.py b/allennlp/data/data_loaders/multiprocess_data_loader.py index d0681bc0c78..e170592c0a5 100644 --- a/allennlp/data/data_loaders/multiprocess_data_loader.py +++ b/allennlp/data/data_loaders/multiprocess_data_loader.py @@ -12,7 +12,11 @@ from allennlp.common.util import lazy_groups_of, shuffle_iterable from allennlp.common.tqdm import Tqdm from allennlp.data.instance import Instance -from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict, allennlp_collate +from allennlp.data.data_loaders.data_loader import ( + DataLoader, + TensorDict, + allennlp_collate, +) from allennlp.data.dataset_readers import DatasetReader, WorkerInfo, DatasetReaderInput from allennlp.data.fields import TextField from allennlp.data.samplers import BatchSampler diff --git a/allennlp/data/data_loaders/multitask_data_loader.py b/allennlp/data/data_loaders/multitask_data_loader.py index 222bd7d8324..9047f2d3efe 100644 --- a/allennlp/data/data_loaders/multitask_data_loader.py +++ b/allennlp/data/data_loaders/multitask_data_loader.py @@ -6,7 +6,10 @@ from overrides import overrides from allennlp.common import util -from allennlp.data.dataset_readers.dataset_reader import DatasetReader, DatasetReaderInput +from allennlp.data.dataset_readers.dataset_reader import ( + DatasetReader, + DatasetReaderInput, +) from allennlp.data.batch import Batch from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict from allennlp.data.data_loaders.multiprocess_data_loader import MultiProcessDataLoader diff --git a/allennlp/data/data_loaders/multitask_scheduler.py b/allennlp/data/data_loaders/multitask_scheduler.py index f77d070f498..044ba57669f 100644 --- a/allennlp/data/data_loaders/multitask_scheduler.py +++ b/allennlp/data/data_loaders/multitask_scheduler.py @@ -71,7 +71,9 @@ def batch_instances( self, epoch_instances: Dict[str, Iterable[Instance]] ) -> Iterable[List[Instance]]: return _chunked_iterator( - more_itertools.roundrobin(*epoch_instances.values()), self.batch_size, self.drop_last + 
more_itertools.roundrobin(*epoch_instances.values()), + self.batch_size, + self.drop_last, ) def count_batches(self, dataset_counts: Dict[str, int]) -> int: diff --git a/allennlp/data/data_loaders/simple_data_loader.py b/allennlp/data/data_loaders/simple_data_loader.py index 26b66b30893..9c77021e16f 100644 --- a/allennlp/data/data_loaders/simple_data_loader.py +++ b/allennlp/data/data_loaders/simple_data_loader.py @@ -6,7 +6,11 @@ import torch from allennlp.common.util import lazy_groups_of -from allennlp.data.data_loaders.data_loader import DataLoader, allennlp_collate, TensorDict +from allennlp.data.data_loaders.data_loader import ( + DataLoader, + allennlp_collate, + TensorDict, +) from allennlp.data.dataset_readers import DatasetReader from allennlp.data.instance import Instance from allennlp.data.vocabulary import Vocabulary diff --git a/allennlp/data/dataset_readers/__init__.py b/allennlp/data/dataset_readers/__init__.py index 274d9d7e4ee..72f9ba3f9f3 100644 --- a/allennlp/data/dataset_readers/__init__.py +++ b/allennlp/data/dataset_readers/__init__.py @@ -14,8 +14,12 @@ ) from allennlp.data.dataset_readers.babi import BabiReader from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader -from allennlp.data.dataset_readers.interleaving_dataset_reader import InterleavingDatasetReader +from allennlp.data.dataset_readers.interleaving_dataset_reader import ( + InterleavingDatasetReader, +) from allennlp.data.dataset_readers.multitask import MultiTaskDatasetReader from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader from allennlp.data.dataset_readers.sharded_dataset_reader import ShardedDatasetReader -from allennlp.data.dataset_readers.text_classification_json import TextClassificationJsonReader +from allennlp.data.dataset_readers.text_classification_json import ( + TextClassificationJsonReader, +) diff --git a/allennlp/data/dataset_readers/conll2003.py b/allennlp/data/dataset_readers/conll2003.py index 19ca273c258..d2bbffe36d8 100644 --- a/allennlp/data/dataset_readers/conll2003.py +++ b/allennlp/data/dataset_readers/conll2003.py @@ -109,7 +109,9 @@ def __init__( convert_to_coding_scheme = coding_scheme super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} if tag_label is not None and tag_label not in self._VALID_LABELS: diff --git a/allennlp/data/dataset_readers/dataset_reader.py b/allennlp/data/dataset_readers/dataset_reader.py index a47178061a6..669915c01e6 100644 --- a/allennlp/data/dataset_readers/dataset_reader.py +++ b/allennlp/data/dataset_readers/dataset_reader.py @@ -383,7 +383,10 @@ def _multi_worker_islice( UserWarning, ) sharded_slice = itertools.islice( - sharded_slice, self._worker_info.id, None, self._worker_info.num_workers + sharded_slice, + self._worker_info.id, + None, + self._worker_info.num_workers, ) if max_instances is not None: diff --git a/allennlp/data/dataset_readers/dataset_utils/__init__.py b/allennlp/data/dataset_readers/dataset_utils/__init__.py index 4af41f46ce9..56d972e0c29 100644 --- a/allennlp/data/dataset_readers/dataset_utils/__init__.py +++ b/allennlp/data/dataset_readers/dataset_utils/__init__.py @@ -1,4 +1,7 @@ from allennlp.data.dataset_readers.dataset_utils.span_utils import enumerate_spans from allennlp.data.dataset_readers.dataset_utils.span_utils import bio_tags_to_spans -from 
allennlp.data.dataset_readers.dataset_utils.span_utils import to_bioul, iob1_to_bioul +from allennlp.data.dataset_readers.dataset_utils.span_utils import ( + to_bioul, + iob1_to_bioul, +) from allennlp.data.dataset_readers.dataset_utils.span_utils import bioul_tags_to_spans diff --git a/allennlp/data/dataset_readers/sequence_tagging.py b/allennlp/data/dataset_readers/sequence_tagging.py index 40f03a5d6de..0de82b82614 100644 --- a/allennlp/data/dataset_readers/sequence_tagging.py +++ b/allennlp/data/dataset_readers/sequence_tagging.py @@ -50,7 +50,9 @@ def __init__( **kwargs, ) -> None: super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} self._word_tag_delimiter = word_tag_delimiter diff --git a/allennlp/data/dataset_readers/sharded_dataset_reader.py b/allennlp/data/dataset_readers/sharded_dataset_reader.py index 2976bb332eb..0f1505d1d3c 100644 --- a/allennlp/data/dataset_readers/sharded_dataset_reader.py +++ b/allennlp/data/dataset_readers/sharded_dataset_reader.py @@ -38,7 +38,9 @@ class ShardedDatasetReader(DatasetReader): def __init__(self, base_reader: DatasetReader, **kwargs) -> None: super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self.reader = base_reader # We have to make the base reader think that it's the only worker so that it doesn't diff --git a/allennlp/data/dataset_readers/text_classification_json.py b/allennlp/data/dataset_readers/text_classification_json.py index 81d1a80ebfc..5dea99685ea 100644 --- a/allennlp/data/dataset_readers/text_classification_json.py +++ b/allennlp/data/dataset_readers/text_classification_json.py @@ -56,7 +56,9 @@ def __init__( **kwargs, ) -> None: super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self._tokenizer = tokenizer or SpacyTokenizer() self._segment_sentences = segment_sentences diff --git a/allennlp/data/fields/adjacency_field.py b/allennlp/data/fields/adjacency_field.py index cf45cf8cf98..a6d2ffe6f28 100644 --- a/allennlp/data/fields/adjacency_field.py +++ b/allennlp/data/fields/adjacency_field.py @@ -135,7 +135,9 @@ def empty_field(self) -> "AdjacencyField": # The empty_list here is needed for mypy empty_list: List[Tuple[int, int]] = [] adjacency_field = AdjacencyField( - empty_list, self.sequence_field.empty_field(), padding_value=self._padding_value + empty_list, + self.sequence_field.empty_field(), + padding_value=self._padding_value, ) return adjacency_field diff --git a/allennlp/data/fields/field.py b/allennlp/data/fields/field.py index 40842293e26..a52fde6961b 100644 --- a/allennlp/data/fields/field.py +++ b/allennlp/data/fields/field.py @@ -6,7 +6,10 @@ from allennlp.data.vocabulary import Vocabulary DataArray = TypeVar( - "DataArray", torch.Tensor, Dict[str, torch.Tensor], Dict[str, Dict[str, torch.Tensor]] + "DataArray", + torch.Tensor, + Dict[str, torch.Tensor], + Dict[str, Dict[str, torch.Tensor]], ) diff --git a/allennlp/data/fields/label_field.py b/allennlp/data/fields/label_field.py index 06ebf47c579..bfc13e1145a 100644 --- a/allennlp/data/fields/label_field.py +++ b/allennlp/data/fields/label_field.py @@ -46,7 +46,10 @@ class 
LabelField(Field[torch.Tensor]): _already_warned_namespaces: Set[str] = set() def __init__( - self, label: Union[str, int], label_namespace: str = "labels", skip_indexing: bool = False + self, + label: Union[str, int], + label_namespace: str = "labels", + skip_indexing: bool = False, ) -> None: self.label = label self._label_namespace = label_namespace diff --git a/allennlp/data/fields/list_field.py b/allennlp/data/fields/list_field.py index 0a77a75a5d8..166d4f69b37 100644 --- a/allennlp/data/fields/list_field.py +++ b/allennlp/data/fields/list_field.py @@ -86,7 +86,9 @@ def sequence_length(self) -> int: @overrides def as_tensor(self, padding_lengths: Dict[str, int]) -> DataArray: padded_field_list = pad_sequence_to_length( - self.field_list, padding_lengths["num_fields"], self.field_list[0].empty_field + self.field_list, + padding_lengths["num_fields"], + self.field_list[0].empty_field, ) # Here we're removing the scoping on the padding length keys that we added in # `get_padding_lengths`; see the note there for more detail. diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 9d171223fd6..6997a6d0715 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -45,7 +45,9 @@ class TextField(SequenceField[TextFieldTensors]): __slots__ = ["tokens", "_token_indexers", "_indexed_tokens"] def __init__( - self, tokens: List[Token], token_indexers: Optional[Dict[str, TokenIndexer]] = None + self, + tokens: List[Token], + token_indexers: Optional[Dict[str, TokenIndexer]] = None, ) -> None: self.tokens = tokens self._token_indexers = token_indexers diff --git a/allennlp/data/image_loader.py b/allennlp/data/image_loader.py index f5f081763c6..cc7c5c1e724 100644 --- a/allennlp/data/image_loader.py +++ b/allennlp/data/image_loader.py @@ -69,7 +69,9 @@ def __call__(self, filename_or_filenames: Union[OnePath, ManyPaths]) -> ImagesWi size = cast( IntTensor, torch.tensor( - [image.shape[-2], image.shape[-1]], dtype=torch.int32, device=self.device + [image.shape[-2], image.shape[-1]], + dtype=torch.int32, + device=self.device, ), ) images.append(image) diff --git a/allennlp/data/token_indexers/__init__.py b/allennlp/data/token_indexers/__init__.py index 912c6bd57f5..07849db674f 100644 --- a/allennlp/data/token_indexers/__init__.py +++ b/allennlp/data/token_indexers/__init__.py @@ -7,7 +7,9 @@ from allennlp.data.token_indexers.token_indexer import TokenIndexer from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer from allennlp.data.token_indexers.spacy_indexer import SpacyTokenIndexer -from allennlp.data.token_indexers.pretrained_transformer_indexer import PretrainedTransformerIndexer +from allennlp.data.token_indexers.pretrained_transformer_indexer import ( + PretrainedTransformerIndexer, +) from allennlp.data.token_indexers.pretrained_transformer_mismatched_indexer import ( PretrainedTransformerMismatchedIndexer, ) diff --git a/allennlp/data/token_indexers/elmo_indexer.py b/allennlp/data/token_indexers/elmo_indexer.py index c5e6c37d910..5167a8814a7 100644 --- a/allennlp/data/token_indexers/elmo_indexer.py +++ b/allennlp/data/token_indexers/elmo_indexer.py @@ -153,7 +153,9 @@ def padding_token(): tensor_dict["elmo_tokens"] = torch.LongTensor( pad_sequence_to_length( - tokens["elmo_tokens"], padding_lengths["elmo_tokens"], default_value=padding_token + tokens["elmo_tokens"], + padding_lengths["elmo_tokens"], + default_value=padding_token, ) ) return tensor_dict diff --git a/allennlp/data/tokenizers/__init__.py 
b/allennlp/data/tokenizers/__init__.py index d2501600f81..aa19c0f5eb3 100644 --- a/allennlp/data/tokenizers/__init__.py +++ b/allennlp/data/tokenizers/__init__.py @@ -7,7 +7,9 @@ from allennlp.data.tokenizers.tokenizer import Tokenizer from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer from allennlp.data.tokenizers.letters_digits_tokenizer import LettersDigitsTokenizer -from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer +from allennlp.data.tokenizers.pretrained_transformer_tokenizer import ( + PretrainedTransformerTokenizer, +) from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer from allennlp.data.tokenizers.sentence_splitter import SentenceSplitter from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer diff --git a/allennlp/data/vocabulary.py b/allennlp/data/vocabulary.py index eca12a1495b..89aa17e1da6 100644 --- a/allennlp/data/vocabulary.py +++ b/allennlp/data/vocabulary.py @@ -10,7 +10,17 @@ import re from collections import defaultdict from transformers import PreTrainedTokenizer -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Union, TYPE_CHECKING +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Set, + Union, + TYPE_CHECKING, +) from allennlp.common import Registrable from allennlp.common.file_utils import cached_path, FileLock diff --git a/allennlp/interpret/__init__.py b/allennlp/interpret/__init__.py index 3111d8ee6bf..b45c9b3fa83 100644 --- a/allennlp/interpret/__init__.py +++ b/allennlp/interpret/__init__.py @@ -1,2 +1,4 @@ from allennlp.interpret.attackers.attacker import Attacker -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) diff --git a/allennlp/interpret/attackers/hotflip.py b/allennlp/interpret/attackers/hotflip.py index a9d15db7615..bd7cddf3f01 100644 --- a/allennlp/interpret/attackers/hotflip.py +++ b/allennlp/interpret/attackers/hotflip.py @@ -18,7 +18,17 @@ from allennlp.nn import util from allennlp.predictors.predictor import Predictor -DEFAULT_IGNORE_TOKENS = ["@@NULL@@", ".", ",", ";", "!", "?", "[MASK]", "[SEP]", "[CLS]"] +DEFAULT_IGNORE_TOKENS = [ + "@@NULL@@", + ".", + ",", + ";", + "!", + "?", + "[MASK]", + "[SEP]", + "[CLS]", +] @Attacker.register("hotflip") @@ -57,7 +67,10 @@ class Hotflip(Attacker): """ def __init__( - self, predictor: Predictor, vocab_namespace: str = "tokens", max_tokens: int = 5000 + self, + predictor: Predictor, + vocab_namespace: str = "tokens", + max_tokens: int = 5000, ) -> None: super().__init__(predictor) self.vocab = self.predictor._model.vocab @@ -230,7 +243,11 @@ def attack_from_json( final_outputs.append(outputs) return sanitize( - {"final": final_tokens, "original": original_tokens, "outputs": final_outputs} + { + "final": final_tokens, + "original": original_tokens, + "outputs": final_outputs, + } ) def attack_instance( diff --git a/allennlp/interpret/attackers/input_reduction.py b/allennlp/interpret/attackers/input_reduction.py index b098c858fb7..72702dd5e69 100644 --- a/allennlp/interpret/attackers/input_reduction.py +++ b/allennlp/interpret/attackers/input_reduction.py @@ -51,7 +51,11 @@ def attack_from_json( for instance in original_instances: final_tokens.append( self._attack_instance( - inputs, instance, input_field_to_attack, grad_input_field, ignore_tokens + inputs, + instance, + input_field_to_attack, + 
grad_input_field, + ignore_tokens, ) ) return sanitize({"final": final_tokens, "original": original_tokens}) diff --git a/allennlp/interpret/saliency_interpreters/__init__.py b/allennlp/interpret/saliency_interpreters/__init__.py index 1fc08d2ec9d..911bf97cdc8 100644 --- a/allennlp/interpret/saliency_interpreters/__init__.py +++ b/allennlp/interpret/saliency_interpreters/__init__.py @@ -1,4 +1,8 @@ -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) from allennlp.interpret.saliency_interpreters.simple_gradient import SimpleGradient -from allennlp.interpret.saliency_interpreters.integrated_gradient import IntegratedGradient +from allennlp.interpret.saliency_interpreters.integrated_gradient import ( + IntegratedGradient, +) from allennlp.interpret.saliency_interpreters.smooth_gradient import SmoothGradient diff --git a/allennlp/interpret/saliency_interpreters/integrated_gradient.py b/allennlp/interpret/saliency_interpreters/integrated_gradient.py index 8b1f8f0af26..5d353a46bc6 100644 --- a/allennlp/interpret/saliency_interpreters/integrated_gradient.py +++ b/allennlp/interpret/saliency_interpreters/integrated_gradient.py @@ -6,7 +6,9 @@ from allennlp.common.util import JsonDict, sanitize from allennlp.data import Instance -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) from allennlp.nn import util diff --git a/allennlp/interpret/saliency_interpreters/simple_gradient.py b/allennlp/interpret/saliency_interpreters/simple_gradient.py index 639da42e824..ffe5f622a43 100644 --- a/allennlp/interpret/saliency_interpreters/simple_gradient.py +++ b/allennlp/interpret/saliency_interpreters/simple_gradient.py @@ -5,7 +5,9 @@ import torch from allennlp.common.util import JsonDict, sanitize -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) from allennlp.nn import util diff --git a/allennlp/interpret/saliency_interpreters/smooth_gradient.py b/allennlp/interpret/saliency_interpreters/smooth_gradient.py index 7f088fa4789..55688b47c2a 100644 --- a/allennlp/interpret/saliency_interpreters/smooth_gradient.py +++ b/allennlp/interpret/saliency_interpreters/smooth_gradient.py @@ -6,7 +6,9 @@ from allennlp.common.util import JsonDict, sanitize from allennlp.data import Instance -from allennlp.interpret.saliency_interpreters.saliency_interpreter import SaliencyInterpreter +from allennlp.interpret.saliency_interpreters.saliency_interpreter import ( + SaliencyInterpreter, +) from allennlp.predictors import Predictor diff --git a/allennlp/models/basic_classifier.py b/allennlp/models/basic_classifier.py index 26602ccca1f..11e02853eb8 100644 --- a/allennlp/models/basic_classifier.py +++ b/allennlp/models/basic_classifier.py @@ -5,7 +5,12 @@ from allennlp.data import TextFieldTensors, Vocabulary from allennlp.models.model import Model -from allennlp.modules import FeedForward, Seq2SeqEncoder, Seq2VecEncoder, TextFieldEmbedder +from allennlp.modules import ( + FeedForward, + Seq2SeqEncoder, + Seq2VecEncoder, + TextFieldEmbedder, +) from allennlp.nn import InitializerApplicator, util from allennlp.nn.util import get_text_field_mask from allennlp.training.metrics import CategoricalAccuracy 
diff --git a/allennlp/models/model.py b/allennlp/models/model.py index 68aff36f9ad..07604a1bda1 100644 --- a/allennlp/models/model.py +++ b/allennlp/models/model.py @@ -445,7 +445,9 @@ def from_archive(cls, archive_file: str, vocab: Vocabulary = None) -> "Model": If `vocab` is given, we will extend the loaded model's vocabulary using the passed vocab object (including calling `extend_embedder_vocab`, which extends embedding layers). """ - from allennlp.models.archival import load_archive # here to avoid circular imports + from allennlp.models.archival import ( + load_archive, + ) # here to avoid circular imports model = load_archive(archive_file).model if vocab: diff --git a/allennlp/modules/attention/__init__.py b/allennlp/modules/attention/__init__.py index ba9ba3ad021..a82a7c445af 100644 --- a/allennlp/modules/attention/__init__.py +++ b/allennlp/modules/attention/__init__.py @@ -4,4 +4,6 @@ from allennlp.modules.attention.cosine_attention import CosineAttention from allennlp.modules.attention.dot_product_attention import DotProductAttention from allennlp.modules.attention.linear_attention import LinearAttention -from allennlp.modules.attention.scaled_dot_product_attention import ScaledDotProductAttention +from allennlp.modules.attention.scaled_dot_product_attention import ( + ScaledDotProductAttention, +) diff --git a/allennlp/modules/attention/attention.py b/allennlp/modules/attention/attention.py index 1c525bf3bc4..9c2ba1d5c6b 100644 --- a/allennlp/modules/attention/attention.py +++ b/allennlp/modules/attention/attention.py @@ -40,7 +40,10 @@ def __init__(self, normalize: bool = True) -> None: @overrides def forward( - self, vector: torch.Tensor, matrix: torch.Tensor, matrix_mask: torch.BoolTensor = None + self, + vector: torch.Tensor, + matrix: torch.Tensor, + matrix_mask: torch.BoolTensor = None, ) -> torch.Tensor: similarities = self._forward_internal(vector, matrix) if self._normalize: diff --git a/allennlp/modules/augmented_lstm.py b/allennlp/modules/augmented_lstm.py index 93757aa6125..da55ffb6d30 100644 --- a/allennlp/modules/augmented_lstm.py +++ b/allennlp/modules/augmented_lstm.py @@ -38,7 +38,11 @@ class AugmentedLSTMCell(torch.nn.Module): """ def __init__( - self, embed_dim: int, lstm_dim: int, use_highway: bool = True, use_bias: bool = True + self, + embed_dim: int, + lstm_dim: int, + use_highway: bool = True, + use_bias: bool = True, ): super().__init__() self.embed_dim = embed_dim @@ -121,7 +125,13 @@ def forward( if self.use_highway: fused_op = projected_input[:, : 5 * self.lstm_dim] + projected_state fused_chunked = torch.chunk(fused_op, 5, 1) - (input_gate, forget_gate, memory_init, output_gate, highway_gate) = fused_chunked + ( + input_gate, + forget_gate, + memory_init, + output_gate, + highway_gate, + ) = fused_chunked highway_gate = torch.sigmoid(highway_gate) else: fused_op = projected_input + projected_state @@ -199,7 +209,9 @@ def __init__( ) def forward( - self, inputs: PackedSequence, states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self, + inputs: PackedSequence, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ) -> Tuple[PackedSequence, Tuple[torch.Tensor, torch.Tensor]]: """ Warning: Would be better to use the BiAugmentedLstm class in a regular model @@ -385,7 +397,9 @@ def __init__( self.representation_dim = lstm_embed_dim def forward( - self, inputs: torch.Tensor, states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + self, + inputs: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ) -> 
Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Given an input batch of sequential data such as word embeddings, produces @@ -423,7 +437,9 @@ def forward( return self._forward_unidirectional(inputs, states) def _forward_bidirectional( - self, inputs: PackedSequence, states: Optional[Tuple[torch.Tensor, torch.Tensor]] + self, + inputs: PackedSequence, + states: Optional[Tuple[torch.Tensor, torch.Tensor]], ): output_sequence = inputs final_h = [] @@ -439,7 +455,8 @@ def _forward_bidirectional( else: hidden_states = list( zip( # type: ignore - states[0].chunk(self.num_layers, 0), states[1].chunk(self.num_layers, 0) + states[0].chunk(self.num_layers, 0), + states[1].chunk(self.num_layers, 0), ) ) for i, state in enumerate(hidden_states): @@ -473,7 +490,9 @@ def _forward_bidirectional( return output_sequence, final_state_tuple def _forward_unidirectional( - self, inputs: PackedSequence, states: Optional[Tuple[torch.Tensor, torch.Tensor]] + self, + inputs: PackedSequence, + states: Optional[Tuple[torch.Tensor, torch.Tensor]], ): output_sequence = inputs final_h = [] @@ -489,7 +508,8 @@ def _forward_unidirectional( else: hidden_states = list( zip( # type: ignore - states[0].chunk(self.num_layers, 0), states[1].chunk(self.num_layers, 0) + states[0].chunk(self.num_layers, 0), + states[1].chunk(self.num_layers, 0), ) # type: ignore ) diff --git a/allennlp/modules/backbones/__init__.py b/allennlp/modules/backbones/__init__.py index 050d67fd2e1..2738ebcec6b 100644 --- a/allennlp/modules/backbones/__init__.py +++ b/allennlp/modules/backbones/__init__.py @@ -1,3 +1,5 @@ from allennlp.modules.backbones.backbone import Backbone -from allennlp.modules.backbones.pretrained_transformer_backbone import PretrainedTransformerBackbone +from allennlp.modules.backbones.pretrained_transformer_backbone import ( + PretrainedTransformerBackbone, +) from allennlp.modules.backbones.vilbert_backbone import VilbertBackbone diff --git a/allennlp/modules/backbones/vilbert_backbone.py b/allennlp/modules/backbones/vilbert_backbone.py index 99f790d1896..e9f18fdd339 100644 --- a/allennlp/modules/backbones/vilbert_backbone.py +++ b/allennlp/modules/backbones/vilbert_backbone.py @@ -7,7 +7,11 @@ from allennlp.data.fields.text_field import TextFieldTensors from allennlp.data.vocabulary import Vocabulary from allennlp.modules.backbones.backbone import Backbone -from allennlp.modules.transformer import BiModalEncoder, ImageFeatureEmbeddings, Embeddings +from allennlp.modules.transformer import ( + BiModalEncoder, + ImageFeatureEmbeddings, + Embeddings, +) logger = logging.getLogger(__name__) diff --git a/allennlp/modules/bimpm_matching.py b/allennlp/modules/bimpm_matching.py index 6d75ede67bf..b69dd0c71d4 100644 --- a/allennlp/modules/bimpm_matching.py +++ b/allennlp/modules/bimpm_matching.py @@ -170,7 +170,9 @@ def create_parameter(): # utility function to create and initialize a parameter torch.nn.init.kaiming_normal_(param) return param - def share_or_create(weights_to_share): # utility function to create or share the weights + def share_or_create( + weights_to_share, + ): # utility function to create or share the weights return weights_to_share if share_weights_between_directions else create_parameter() output_dim = ( @@ -322,10 +324,14 @@ def forward( matching_vector_max, mask_2.unsqueeze(-2).unsqueeze(-1), dim=2 ) matching_vector_2_max = masked_max( - matching_vector_max.permute(0, 2, 1, 3), mask_1.unsqueeze(-2).unsqueeze(-1), dim=2 + matching_vector_max.permute(0, 2, 1, 3), + mask_1.unsqueeze(-2).unsqueeze(-1), + 
dim=2, ) matching_vector_2_mean = masked_mean( - matching_vector_max.permute(0, 2, 1, 3), mask_1.unsqueeze(-2).unsqueeze(-1), dim=2 + matching_vector_max.permute(0, 2, 1, 3), + mask_1.unsqueeze(-2).unsqueeze(-1), + dim=2, ) matching_vector_1.extend([matching_vector_1_max, matching_vector_1_mean]) diff --git a/allennlp/modules/conditional_random_field.py b/allennlp/modules/conditional_random_field.py index c40e19359b7..7efe523bfe1 100644 --- a/allennlp/modules/conditional_random_field.py +++ b/allennlp/modules/conditional_random_field.py @@ -34,7 +34,10 @@ def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tu num_labels = len(labels) start_tag = num_labels end_tag = num_labels + 1 - labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")] + labels_with_boundaries = list(labels.items()) + [ + (start_tag, "START"), + (end_tag, "END"), + ] allowed = [] for from_label_index, from_label in labels_with_boundaries: diff --git a/allennlp/modules/elmo.py b/allennlp/modules/elmo.py index 1061a8fbdc4..4aca5e1040c 100644 --- a/allennlp/modules/elmo.py +++ b/allennlp/modules/elmo.py @@ -196,9 +196,10 @@ def forward( processed_representation = representation_with_bos_eos processed_mask = mask_with_bos_eos else: - representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries( - representation_with_bos_eos, mask_with_bos_eos - ) + ( + representation_without_bos_eos, + mask_without_bos_eos, + ) = remove_sentence_boundaries(representation_with_bos_eos, mask_with_bos_eos) processed_representation = representation_without_bos_eos processed_mask = mask_without_bos_eos representations.append(self._dropout(processed_representation)) @@ -336,14 +337,18 @@ def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]: # Add BOS/EOS mask = (inputs > 0).sum(dim=-1) > 0 character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids( - inputs, mask, self._beginning_of_sentence_characters, self._end_of_sentence_characters + inputs, + mask, + self._beginning_of_sentence_characters, + self._end_of_sentence_characters, ) # the character id embedding max_chars_per_token = self._options["char_cnn"]["max_characters_per_token"] # (batch_size * sequence_length, max_chars_per_token, embed_dim) character_embedding = torch.nn.functional.embedding( - character_ids_with_bos_eos.view(-1, max_chars_per_token), self._char_embedding_weights + character_ids_with_bos_eos.view(-1, max_chars_per_token), + self._char_embedding_weights, ) # run convolutions @@ -394,7 +399,8 @@ def _load_char_embedding(self): char_embed_weights = fin["char_embed"][...] 
weights = numpy.zeros( - (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]), dtype="float32" + (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]), + dtype="float32", ) weights[1:, :] = char_embed_weights @@ -410,7 +416,10 @@ def _load_cnn_weights(self): convolutions = [] for i, (width, num) in enumerate(filters): conv = torch.nn.Conv1d( - in_channels=char_embed_dim, out_channels=num, kernel_size=width, bias=True + in_channels=char_embed_dim, + out_channels=num, + kernel_size=width, + bias=True, ) # load the weights with h5py.File(cached_path(self._weight_file), "r") as fin: @@ -583,7 +592,10 @@ def forward( embedded_inputs = self._word_embedding(word_inputs) # type: ignore # shape (batch_size, timesteps + 2, embedding_dim) type_representation, mask = add_sentence_boundary_token_ids( - embedded_inputs, mask_without_bos_eos, self._bos_embedding, self._eos_embedding + embedded_inputs, + mask_without_bos_eos, + self._bos_embedding, + self._eos_embedding, ) except (RuntimeError, IndexError): # Back off to running the character convolutions, diff --git a/allennlp/modules/elmo_lstm.py b/allennlp/modules/elmo_lstm.py index ca89a3fa571..9147b23a8dd 100644 --- a/allennlp/modules/elmo_lstm.py +++ b/allennlp/modules/elmo_lstm.py @@ -126,11 +126,18 @@ def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: where the num_layers dimension represents the LSTM output from that layer. """ batch_size, total_sequence_length = mask.size() - stacked_sequence_output, final_states, restoration_indices = self.sort_and_run_forward( - self._lstm_forward, inputs, mask - ) - - num_layers, num_valid, returned_timesteps, encoder_dim = stacked_sequence_output.size() + ( + stacked_sequence_output, + final_states, + restoration_indices, + ) = self.sort_and_run_forward(self._lstm_forward, inputs, mask) + + ( + num_layers, + num_valid, + returned_timesteps, + encoder_dim, + ) = stacked_sequence_output.size() # Add back invalid rows which were removed in the call to sort_and_run_forward. 
if num_valid < batch_size: zeros = stacked_sequence_output.new_zeros( diff --git a/allennlp/modules/matrix_attention/__init__.py b/allennlp/modules/matrix_attention/__init__.py index 4807383db9d..52fdf323d4c 100644 --- a/allennlp/modules/matrix_attention/__init__.py +++ b/allennlp/modules/matrix_attention/__init__.py @@ -1,5 +1,13 @@ from allennlp.modules.matrix_attention.matrix_attention import MatrixAttention -from allennlp.modules.matrix_attention.bilinear_matrix_attention import BilinearMatrixAttention -from allennlp.modules.matrix_attention.cosine_matrix_attention import CosineMatrixAttention -from allennlp.modules.matrix_attention.dot_product_matrix_attention import DotProductMatrixAttention -from allennlp.modules.matrix_attention.linear_matrix_attention import LinearMatrixAttention +from allennlp.modules.matrix_attention.bilinear_matrix_attention import ( + BilinearMatrixAttention, +) +from allennlp.modules.matrix_attention.cosine_matrix_attention import ( + CosineMatrixAttention, +) +from allennlp.modules.matrix_attention.dot_product_matrix_attention import ( + DotProductMatrixAttention, +) +from allennlp.modules.matrix_attention.linear_matrix_attention import ( + LinearMatrixAttention, +) diff --git a/allennlp/modules/matrix_attention/linear_matrix_attention.py b/allennlp/modules/matrix_attention/linear_matrix_attention.py index 1184b848198..a321a0154b1 100644 --- a/allennlp/modules/matrix_attention/linear_matrix_attention.py +++ b/allennlp/modules/matrix_attention/linear_matrix_attention.py @@ -70,6 +70,8 @@ def reset_parameters(self): @overrides def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor: combined_tensors = util.combine_tensors_and_multiply( - self._combination, [matrix_1.unsqueeze(2), matrix_2.unsqueeze(1)], self._weight_vector + self._combination, + [matrix_1.unsqueeze(2), matrix_2.unsqueeze(1)], + self._weight_vector, ) return self._activation(combined_tensors + self._bias) diff --git a/allennlp/modules/sampled_softmax_loss.py b/allennlp/modules/sampled_softmax_loss.py index 88e82975c25..6942db08fe3 100644 --- a/allennlp/modules/sampled_softmax_loss.py +++ b/allennlp/modules/sampled_softmax_loss.py @@ -163,7 +163,10 @@ def forward( return self._forward_train(embeddings, targets, target_token_embedding) def _forward_train( - self, embeddings: torch.Tensor, targets: torch.Tensor, target_token_embedding: torch.Tensor + self, + embeddings: torch.Tensor, + targets: torch.Tensor, + target_token_embedding: torch.Tensor, ) -> torch.Tensor: # (target_token_embedding is only used in the tie_embeddings case, diff --git a/allennlp/modules/scalar_mix.py b/allennlp/modules/scalar_mix.py index 4c003a8837a..8b5d069aa3d 100644 --- a/allennlp/modules/scalar_mix.py +++ b/allennlp/modules/scalar_mix.py @@ -38,7 +38,8 @@ def __init__( self.scalar_parameters = ParameterList( [ Parameter( - torch.FloatTensor([initial_scalar_parameters[i]]), requires_grad=trainable + torch.FloatTensor([initial_scalar_parameters[i]]), + requires_grad=trainable, ) for i in range(mixture_size) ] diff --git a/allennlp/modules/seq2seq_encoders/__init__.py b/allennlp/modules/seq2seq_encoders/__init__.py index 3cacdc80f9a..207effc1041 100644 --- a/allennlp/modules/seq2seq_encoders/__init__.py +++ b/allennlp/modules/seq2seq_encoders/__init__.py @@ -33,4 +33,6 @@ StackedBidirectionalLstmSeq2SeqEncoder, ) from allennlp.modules.seq2seq_encoders.seq2seq_encoder import Seq2SeqEncoder -from allennlp.modules.seq2seq_encoders.pytorch_transformer_wrapper import PytorchTransformer +from 
allennlp.modules.seq2seq_encoders.pytorch_transformer_wrapper import ( + PytorchTransformer, +) diff --git a/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py b/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py index 398255a2aa5..b648ef6d011 100644 --- a/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py +++ b/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py @@ -28,7 +28,12 @@ def __init__( if len(layer) == 2: # no dilation conv = torch.nn.Conv1d( - last_dim, layer[1] * 2, layer[0], stride=1, padding=layer[0] - 1, bias=True + last_dim, + layer[1] * 2, + layer[0], + stride=1, + padding=layer[0] - 1, + bias=True, ) elif len(layer) == 3: # a dilation diff --git a/allennlp/modules/seq2seq_encoders/pytorch_seq2seq_wrapper.py b/allennlp/modules/seq2seq_encoders/pytorch_seq2seq_wrapper.py index 3f36be0dedd..56ef1e543a6 100644 --- a/allennlp/modules/seq2seq_encoders/pytorch_seq2seq_wrapper.py +++ b/allennlp/modules/seq2seq_encoders/pytorch_seq2seq_wrapper.py @@ -69,7 +69,10 @@ def is_bidirectional(self) -> bool: @overrides def forward( - self, inputs: torch.Tensor, mask: torch.BoolTensor, hidden_state: torch.Tensor = None + self, + inputs: torch.Tensor, + mask: torch.BoolTensor, + hidden_state: torch.Tensor = None, ) -> torch.Tensor: if self.stateful and mask is None: @@ -82,9 +85,11 @@ def forward( batch_size, total_sequence_length = mask.size() - packed_sequence_output, final_states, restoration_indices = self.sort_and_run_forward( - self._module, inputs, mask, hidden_state - ) + ( + packed_sequence_output, + final_states, + restoration_indices, + ) = self.sort_and_run_forward(self._module, inputs, mask, hidden_state) unpacked_sequence_tensor, _ = pad_packed_sequence(packed_sequence_output, batch_first=True) @@ -116,7 +121,9 @@ def forward( sequence_length_difference = total_sequence_length - unpacked_sequence_tensor.size(1) if sequence_length_difference > 0: zeros = unpacked_sequence_tensor.new_zeros( - batch_size, sequence_length_difference, unpacked_sequence_tensor.size(-1) + batch_size, + sequence_length_difference, + unpacked_sequence_tensor.size(-1), ) unpacked_sequence_tensor = torch.cat([unpacked_sequence_tensor, zeros], 1) diff --git a/allennlp/modules/seq2vec_encoders/bert_pooler.py b/allennlp/modules/seq2vec_encoders/bert_pooler.py index 7509807f49f..0401193fc8b 100644 --- a/allennlp/modules/seq2vec_encoders/bert_pooler.py +++ b/allennlp/modules/seq2vec_encoders/bert_pooler.py @@ -77,7 +77,10 @@ def get_output_dim(self) -> int: return self._embedding_dim def forward( - self, tokens: torch.Tensor, mask: torch.BoolTensor = None, num_wrapping_dims: int = 0 + self, + tokens: torch.Tensor, + mask: torch.BoolTensor = None, + num_wrapping_dims: int = 0, ): pooler = self.pooler for _ in range(num_wrapping_dims): diff --git a/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py b/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py index 95d96912aa3..b53b3d9419a 100644 --- a/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py +++ b/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py @@ -67,7 +67,10 @@ def __init__( self._convolutions: List[torch.nn.Module] = [] for i, (width, num) in enumerate(filters): conv = torch.nn.Conv1d( - in_channels=embedding_dim, out_channels=num, kernel_size=width, bias=True + in_channels=embedding_dim, + out_channels=num, + kernel_size=width, + bias=True, ) conv.weight.data.uniform_(-0.05, 0.05) conv.bias.data.fill_(0.0) diff --git a/allennlp/modules/seq2vec_encoders/pytorch_seq2vec_wrapper.py 
b/allennlp/modules/seq2vec_encoders/pytorch_seq2vec_wrapper.py index a52445fa8c1..de9e8cc2ee0 100644 --- a/allennlp/modules/seq2vec_encoders/pytorch_seq2vec_wrapper.py +++ b/allennlp/modules/seq2vec_encoders/pytorch_seq2vec_wrapper.py @@ -61,7 +61,10 @@ def get_output_dim(self) -> int: return self._module.hidden_size * (2 if is_bidirectional else 1) def forward( - self, inputs: torch.Tensor, mask: torch.BoolTensor, hidden_state: torch.Tensor = None + self, + inputs: torch.Tensor, + mask: torch.BoolTensor, + hidden_state: torch.Tensor = None, ) -> torch.Tensor: if mask is None: diff --git a/allennlp/modules/span_extractors/__init__.py b/allennlp/modules/span_extractors/__init__.py index 7a29d5aebdf..ba421ced7f8 100644 --- a/allennlp/modules/span_extractors/__init__.py +++ b/allennlp/modules/span_extractors/__init__.py @@ -1,5 +1,7 @@ from allennlp.modules.span_extractors.span_extractor import SpanExtractor -from allennlp.modules.span_extractors.endpoint_span_extractor import EndpointSpanExtractor +from allennlp.modules.span_extractors.endpoint_span_extractor import ( + EndpointSpanExtractor, +) from allennlp.modules.span_extractors.self_attentive_span_extractor import ( SelfAttentiveSpanExtractor, ) diff --git a/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py b/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py index e63a19b53aa..7545f03a7b0 100644 --- a/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py +++ b/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py @@ -96,7 +96,8 @@ def __init__( self._span_width_embedding: Optional[Embedding] = None if num_width_embeddings is not None and span_width_embedding_dim is not None: self._span_width_embedding = Embedding( - num_embeddings=num_width_embeddings, embedding_dim=span_width_embedding_dim + num_embeddings=num_width_embeddings, + embedding_dim=span_width_embedding_dim, ) elif num_width_embeddings is not None or span_width_embedding_dim is not None: raise ConfigurationError( @@ -229,11 +230,13 @@ def forward( # respective combinations and concatenate these representations. 
# Shape (batch_size, num_spans, forward_combination_dim) forward_spans = util.combine_tensors( - self._forward_combination, [forward_start_embeddings, forward_end_embeddings] + self._forward_combination, + [forward_start_embeddings, forward_end_embeddings], ) # Shape (batch_size, num_spans, backward_combination_dim) backward_spans = util.combine_tensors( - self._backward_combination, [backward_start_embeddings, backward_end_embeddings] + self._backward_combination, + [backward_start_embeddings, backward_end_embeddings], ) # Shape (batch_size, num_spans, forward_combination_dim + backward_combination_dim) span_embeddings = torch.cat([forward_spans, backward_spans], -1) diff --git a/allennlp/modules/span_extractors/endpoint_span_extractor.py b/allennlp/modules/span_extractors/endpoint_span_extractor.py index 86b19cb4a7e..f327566fade 100644 --- a/allennlp/modules/span_extractors/endpoint_span_extractor.py +++ b/allennlp/modules/span_extractors/endpoint_span_extractor.py @@ -74,7 +74,8 @@ def __init__( self._span_width_embedding: Optional[Embedding] = None if num_width_embeddings is not None and span_width_embedding_dim is not None: self._span_width_embedding = Embedding( - num_embeddings=num_width_embeddings, embedding_dim=span_width_embedding_dim + num_embeddings=num_width_embeddings, + embedding_dim=span_width_embedding_dim, ) elif num_width_embeddings is not None or span_width_embedding_dim is not None: raise ConfigurationError( diff --git a/allennlp/modules/text_field_embedders/__init__.py b/allennlp/modules/text_field_embedders/__init__.py index 9feb1eee972..75a8f982572 100644 --- a/allennlp/modules/text_field_embedders/__init__.py +++ b/allennlp/modules/text_field_embedders/__init__.py @@ -4,4 +4,6 @@ """ from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder -from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder +from allennlp.modules.text_field_embedders.basic_text_field_embedder import ( + BasicTextFieldEmbedder, +) diff --git a/allennlp/modules/token_embedders/__init__.py b/allennlp/modules/token_embedders/__init__.py index 8d1492ac362..d317930c1d2 100644 --- a/allennlp/modules/token_embedders/__init__.py +++ b/allennlp/modules/token_embedders/__init__.py @@ -5,13 +5,17 @@ from allennlp.modules.token_embedders.token_embedder import TokenEmbedder from allennlp.modules.token_embedders.embedding import Embedding -from allennlp.modules.token_embedders.token_characters_encoder import TokenCharactersEncoder +from allennlp.modules.token_embedders.token_characters_encoder import ( + TokenCharactersEncoder, +) from allennlp.modules.token_embedders.elmo_token_embedder import ElmoTokenEmbedder from allennlp.modules.token_embedders.empty_embedder import EmptyEmbedder from allennlp.modules.token_embedders.bag_of_word_counts_token_embedder import ( BagOfWordCountsTokenEmbedder, ) -from allennlp.modules.token_embedders.pass_through_token_embedder import PassThroughTokenEmbedder +from allennlp.modules.token_embedders.pass_through_token_embedder import ( + PassThroughTokenEmbedder, +) from allennlp.modules.token_embedders.pretrained_transformer_embedder import ( PretrainedTransformerEmbedder, ) diff --git a/allennlp/modules/token_embedders/embedding.py b/allennlp/modules/token_embedders/embedding.py index a68bdf8c4a1..a0796de710d 100644 --- a/allennlp/modules/token_embedders/embedding.py +++ b/allennlp/modules/token_embedders/embedding.py @@ -14,7 +14,11 @@ from allennlp.common import Tqdm from allennlp.common.checks 
import ConfigurationError -from allennlp.common.file_utils import cached_path, get_file_extension, is_url_or_existing_file +from allennlp.common.file_utils import ( + cached_path, + get_file_extension, + is_url_or_existing_file, +) from allennlp.data.vocabulary import Vocabulary from allennlp.modules.time_distributed import TimeDistributed from allennlp.modules.token_embedders.token_embedder import TokenEmbedder @@ -308,7 +312,10 @@ def extend_vocab( # It's easiest to just reload the embeddings for the entire vocab, # then only keep the ones we need. whole_weight = _read_pretrained_embeddings_file( - extension_pretrained_file, embedding_dim, extended_vocab, vocab_namespace + extension_pretrained_file, + embedding_dim, + extended_vocab, + vocab_namespace, ) extra_weight = whole_weight[self.num_embeddings :, :] @@ -441,18 +448,24 @@ def _read_embeddings_from_text_file( num_tokens_found += 1 else: logger.debug( - "Token %s was not found in the embedding file. Initialising randomly.", token + "Token %s was not found in the embedding file. Initialising randomly.", + token, ) logger.info( - "Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size + "Pretrained embeddings were found for %d out of %d tokens", + num_tokens_found, + vocab_size, ) return embedding_matrix def _read_embeddings_from_hdf5( - embeddings_filename: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens" + embeddings_filename: str, + embedding_dim: int, + vocab: Vocabulary, + namespace: str = "tokens", ) -> torch.FloatTensor: """ Reads from a hdf5 formatted file. The embedding matrix is assumed to diff --git a/allennlp/modules/token_embedders/pretrained_transformer_embedder.py b/allennlp/modules/token_embedders/pretrained_transformer_embedder.py index 9903c310bd8..a102be910ee 100644 --- a/allennlp/modules/token_embedders/pretrained_transformer_embedder.py +++ b/allennlp/modules/token_embedders/pretrained_transformer_embedder.py @@ -194,7 +194,10 @@ def forward( # We call this with kwargs because some of the huggingface models don't have the # token_type_ids parameter and fail even when it's given as None. # Also, as of transformers v2.5.1, they are taking FloatTensor masks. 
- parameters = {"input_ids": token_ids, "attention_mask": transformer_mask.float()} + parameters = { + "input_ids": token_ids, + "attention_mask": transformer_mask.float(), + } if type_ids is not None: parameters["token_type_ids"] = type_ids @@ -214,7 +217,10 @@ def forward( if fold_long_sequences: embeddings = self._unfold_long_sequences( - embeddings, segment_concat_mask, batch_size, num_segment_concat_wordpieces + embeddings, + segment_concat_mask, + batch_size, + num_segment_concat_wordpieces, ) return embeddings @@ -264,7 +270,11 @@ def fold(tensor): # Shape: [batch_size, num_segment_concat_wordpieces] # Shape: [batch_size * num_segments, self._max_length] return tensor.reshape(-1, self._max_length) - return fold(token_ids), fold(mask), fold(type_ids) if type_ids is not None else None + return ( + fold(token_ids), + fold(mask), + fold(type_ids) if type_ids is not None else None, + ) def _unfold_long_sequences( self, @@ -338,7 +348,10 @@ def lengths_to_mask(lengths, max_len, device): embeddings = embeddings.reshape(batch_size, num_segments, self._max_length, embedding_size) embeddings = embeddings[ - :, :, self._num_added_start_tokens : embeddings.size(2) - self._num_added_end_tokens, : + :, + :, + self._num_added_start_tokens : embeddings.size(2) - self._num_added_end_tokens, + :, ] # truncate segment-level start/end tokens embeddings = embeddings.reshape(batch_size, -1, embedding_size) # flatten @@ -358,7 +371,9 @@ def lengths_to_mask(lengths, max_len, device): embeddings = torch.cat([embeddings, torch.zeros_like(end_token_embeddings)], 1) # Add end token embeddings back embeddings.scatter_( - 1, end_token_indices.unsqueeze(-1).expand_as(end_token_embeddings), end_token_embeddings + 1, + end_token_indices.unsqueeze(-1).expand_as(end_token_embeddings), + end_token_embeddings, ) # Now put back start tokens. We can do this before putting back end tokens, but then diff --git a/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py b/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py index 982ca1b1f46..e31f7f0fee4 100644 --- a/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py +++ b/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py @@ -3,7 +3,10 @@ from overrides import overrides import torch -from allennlp.modules.token_embedders import PretrainedTransformerEmbedder, TokenEmbedder +from allennlp.modules.token_embedders import ( + PretrainedTransformerEmbedder, + TokenEmbedder, +) from allennlp.nn import util @@ -105,7 +108,10 @@ def forward( """ # Shape: [batch_size, num_wordpieces, embedding_size]. 
embeddings = self._matched_embedder( - token_ids, wordpiece_mask, type_ids=type_ids, segment_concat_mask=segment_concat_mask + token_ids, + wordpiece_mask, + type_ids=type_ids, + segment_concat_mask=segment_concat_mask, ) # span_embeddings: (batch_size, num_orig_tokens, max_span_length, embedding_size) diff --git a/allennlp/modules/transformer/__init__.py b/allennlp/modules/transformer/__init__.py index f346ace8360..9650a999469 100644 --- a/allennlp/modules/transformer/__init__.py +++ b/allennlp/modules/transformer/__init__.py @@ -123,7 +123,9 @@ def forward(self, token_ids: torch.LongTensor, mask: torch.BoolTensor): ``` """ -from allennlp.modules.transformer.positional_encoding import SinusoidalPositionalEncoding +from allennlp.modules.transformer.positional_encoding import ( + SinusoidalPositionalEncoding, +) from allennlp.modules.transformer.transformer_module import TransformerModule from allennlp.modules.transformer.transformer_embeddings import ( @@ -133,7 +135,10 @@ def forward(self, token_ids: torch.LongTensor, mask: torch.BoolTensor): ) from allennlp.modules.transformer.self_attention import SelfAttention from allennlp.modules.transformer.activation_layer import ActivationLayer -from allennlp.modules.transformer.transformer_layer import AttentionLayer, TransformerLayer +from allennlp.modules.transformer.transformer_layer import ( + AttentionLayer, + TransformerLayer, +) from allennlp.modules.transformer.transformer_stack import TransformerStack from allennlp.modules.transformer.transformer_pooler import TransformerPooler from allennlp.modules.transformer.output_layer import OutputLayer diff --git a/allennlp/modules/transformer/bimodal_connection_layer.py b/allennlp/modules/transformer/bimodal_connection_layer.py index 5d7e4f7fc88..26834b97658 100644 --- a/allennlp/modules/transformer/bimodal_connection_layer.py +++ b/allennlp/modules/transformer/bimodal_connection_layer.py @@ -31,7 +31,10 @@ def forward(self, hidden_states1, input_tensor1, hidden_states2, input_tensor2): class BiModalConnectionLayer(TransformerModule, FromParams): - _huggingface_mapping = {"biAttention": "bimodal_attention", "biOutput": "bimodal_output"} + _huggingface_mapping = { + "biAttention": "bimodal_attention", + "biOutput": "bimodal_output", + } def __init__( self, diff --git a/allennlp/modules/transformer/self_attention.py b/allennlp/modules/transformer/self_attention.py index 6db6aba1fad..7ff7385759c 100644 --- a/allennlp/modules/transformer/self_attention.py +++ b/allennlp/modules/transformer/self_attention.py @@ -134,7 +134,10 @@ def forward( @classmethod def _get_mapping( - cls, pretrained_module=None, source="huggingface", mapping: Optional[Dict[str, str]] = None + cls, + pretrained_module=None, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, ): combined_mapping = {} if "huggingface" in source: diff --git a/allennlp/modules/transformer/transformer_embeddings.py b/allennlp/modules/transformer/transformer_embeddings.py index df0e53c4544..6f9053c8388 100644 --- a/allennlp/modules/transformer/transformer_embeddings.py +++ b/allennlp/modules/transformer/transformer_embeddings.py @@ -71,7 +71,10 @@ def __init__(self, feature_size: int, embedding_size: int, dropout: float = 0.0) image_embeddings = torch.nn.Linear(feature_size, embedding_size) location_embeddings = torch.nn.Linear(4, embedding_size) embeddings = torch.nn.ModuleDict( - {"image_embeddings": image_embeddings, "location_embeddings": location_embeddings} + { + "image_embeddings": image_embeddings, + "location_embeddings": 
location_embeddings, + } ) super().__init__(embeddings, embedding_size, dropout) diff --git a/allennlp/modules/transformer/transformer_layer.py b/allennlp/modules/transformer/transformer_layer.py index 3282b2dbf14..70ec1ff56c1 100644 --- a/allennlp/modules/transformer/transformer_layer.py +++ b/allennlp/modules/transformer/transformer_layer.py @@ -158,10 +158,14 @@ def __init__( ) self.intermediate = ActivationLayer( - hidden_size=hidden_size, intermediate_size=intermediate_size, activation=activation + hidden_size=hidden_size, + intermediate_size=intermediate_size, + activation=activation, ) self.output = OutputLayer( - input_size=intermediate_size, hidden_size=hidden_size, dropout=hidden_dropout + input_size=intermediate_size, + hidden_size=hidden_size, + dropout=hidden_dropout, ) def forward( diff --git a/allennlp/modules/transformer/transformer_module.py b/allennlp/modules/transformer/transformer_module.py index 11b650d84ec..c2c1931ce19 100644 --- a/allennlp/modules/transformer/transformer_module.py +++ b/allennlp/modules/transformer/transformer_module.py @@ -48,7 +48,10 @@ def _get_mapping( @classmethod def _get_mapped_submodules( - cls, pretrained_module, source="huggingface", mapping: Optional[Dict[str, str]] = None + cls, + pretrained_module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, ): """ Subclasses overload this method, and provide appropriate name mapping based on the source. diff --git a/allennlp/modules/transformer/transformer_stack.py b/allennlp/modules/transformer/transformer_stack.py index edeefc27ba9..b475cce7358 100644 --- a/allennlp/modules/transformer/transformer_stack.py +++ b/allennlp/modules/transformer/transformer_stack.py @@ -124,7 +124,12 @@ def forward( return tuple( v - for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] + for v in [ + hidden_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] if v is not None ) diff --git a/allennlp/modules/transformer/util.py b/allennlp/modules/transformer/util.py index 33dfcf77859..80732dc21cf 100644 --- a/allennlp/modules/transformer/util.py +++ b/allennlp/modules/transformer/util.py @@ -3,7 +3,8 @@ def apply_mask( - values: torch.FloatTensor, mask: Union[torch.BoolTensor, torch.IntTensor, torch.FloatTensor] + values: torch.FloatTensor, + mask: Union[torch.BoolTensor, torch.IntTensor, torch.FloatTensor], ) -> torch.FloatTensor: """ # Parameters diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py index fff07b7dac2..dcc795550fc 100644 --- a/allennlp/nn/beam_search.py +++ b/allennlp/nn/beam_search.py @@ -68,7 +68,10 @@ class Sampler(Registrable): default_implementation = "deterministic" def init_state( - self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int + self, + start_class_log_probabilities: torch.Tensor, + batch_size: int, + num_classes: int, ) -> StateType: return {} @@ -192,7 +195,9 @@ def sample_nodes( # NOTE: These indices are not indices into `log_probs`, they are indices into `top_k_log_probs`. # shape: (batch_size, per_node_beam_size) sampled_indices = torch.multinomial( - normalized_top_k_probs, per_node_beam_size, replacement=self.with_replacement + normalized_top_k_probs, + per_node_beam_size, + replacement=self.with_replacement, ) # Convert `sampled_indices` back to indices in the original `log_probs` tensor. @@ -279,7 +284,9 @@ def sample_nodes( # NOTE: These indices are not indices into `log_probs`, they are indices into `log_probs_descending`. 
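# Illustration only (the variable names below are hypothetical, not from this diff):
# indices sampled from a descending-sorted tensor can be mapped back to positions in the
# original tensor by gathering through the sort permutation.
import torch
log_probs = torch.log_softmax(torch.randn(2, 6), dim=-1)
sorted_probs, sorting_indices = log_probs.exp().sort(dim=-1, descending=True)
sampled = torch.multinomial(sorted_probs, 3)             # indices into the sorted tensor
original_indices = sorting_indices.gather(-1, sampled)   # indices into log_probs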
# shape: (batch_size, per_node_beam_size) sampled_indices = torch.multinomial( - filtered_probabilities, per_node_beam_size, replacement=self.with_replacement + filtered_probabilities, + per_node_beam_size, + replacement=self.with_replacement, ) # Convert `sampled_indices` back to indices in the original `log_probs` tensor. @@ -311,7 +318,10 @@ def __init__(self, temperature: float = 1.0): @overrides def init_state( - self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int + self, + start_class_log_probabilities: torch.Tensor, + batch_size: int, + num_classes: int, ) -> StateType: # shape: (batch_size, num_classes) zeros = start_class_log_probabilities.new_zeros((batch_size, num_classes)) @@ -400,7 +410,11 @@ def sample_beams( # shape: (batch_size * beam_size,) phi_S = selected_log_probs.reshape(batch_size * beam_size) - return selected_log_probs, selected_indices, {"G_phi_S": G_phi_S_new, "phi_S": phi_S} + return ( + selected_log_probs, + selected_indices, + {"G_phi_S": G_phi_S_new, "phi_S": phi_S}, + ) def gumbel(self, phi) -> torch.Tensor: """ @@ -579,7 +593,9 @@ def search( old_step = cast(StepFunctionTypeNoTimestep, step) def new_step( - last_predictions: torch.Tensor, state: Dict[str, torch.Tensor], time_step: int + last_predictions: torch.Tensor, + state: Dict[str, torch.Tensor], + time_step: int, ): return old_step(last_predictions, state) @@ -692,7 +708,7 @@ def _search( ) # shape (both): (batch_size * beam_size, per_node_beam_size) - top_log_probabilities, predicted_classes, sampler_state = self.sampler.sample_nodes( + (top_log_probabilities, predicted_classes, sampler_state,) = self.sampler.sample_nodes( cleaned_log_probabilities, self.per_node_beam_size, sampler_state ) diff --git a/allennlp/nn/chu_liu_edmonds.py b/allennlp/nn/chu_liu_edmonds.py index 74d9726ddcc..fa15d9e9265 100644 --- a/allennlp/nn/chu_liu_edmonds.py +++ b/allennlp/nn/chu_liu_edmonds.py @@ -69,7 +69,13 @@ def decode_mst( # The main algorithm operates inplace. chu_liu_edmonds( - length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives + length, + score_matrix, + current_nodes, + final_edges, + old_input, + old_output, + representatives, ) heads = numpy.zeros([max_length], numpy.int32) @@ -224,7 +230,13 @@ def chu_liu_edmonds( representatives[cycle_representative].add(node) chu_liu_edmonds( - length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives + length, + score_matrix, + current_nodes, + final_edges, + old_input, + old_output, + representatives, ) # Expansion stage. 
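The hunks above only re-wrap the recursive `chu_liu_edmonds` calls inside `decode_mst`. For orientation, a minimal usage sketch of the public entry point (the `has_labels` keyword and return values are assumed from the existing `allennlp.nn.chu_liu_edmonds` API, which this diff does not change):

    import numpy
    from allennlp.nn.chu_liu_edmonds import decode_mst

    length = 5
    # A square matrix of arc scores between the `length` nodes (node 0 acting as the root).
    energy = numpy.random.rand(length, length)
    # With has_labels=False a 2D score matrix is expected; heads[i] is the predicted head of node i.
    heads, _ = decode_mst(energy, length, has_labels=False)
    print(heads)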
diff --git a/allennlp/nn/initializers.py b/allennlp/nn/initializers.py index 7d12dd9ba8a..c50aac28370 100644 --- a/allennlp/nn/initializers.py +++ b/allennlp/nn/initializers.py @@ -245,7 +245,10 @@ class KaimingUniformInitializer(_InitializerWrapper): def __init__(self, a: float = 0.0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"): super().__init__( - init_function=torch.nn.init.kaiming_uniform_, a=a, mode=mode, nonlinearity=nonlinearity + init_function=torch.nn.init.kaiming_uniform_, + a=a, + mode=mode, + nonlinearity=nonlinearity, ) @@ -257,7 +260,10 @@ class KaimingNormalInitializer(_InitializerWrapper): def __init__(self, a: float = 0.0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"): super().__init__( - init_function=torch.nn.init.kaiming_normal_, a=a, mode=mode, nonlinearity=nonlinearity + init_function=torch.nn.init.kaiming_normal_, + a=a, + mode=mode, + nonlinearity=nonlinearity, ) @@ -463,7 +469,9 @@ class InitializerApplicator(FromParams): """ def __init__( - self, regexes: List[Tuple[str, Initializer]] = None, prevent_regexes: List[str] = None + self, + regexes: List[Tuple[str, Initializer]] = None, + prevent_regexes: List[str] = None, ) -> None: self._initializers = regexes or [] self._prevent_regex = None diff --git a/allennlp/nn/util.py b/allennlp/nn/util.py index 9cc1c313156..31cf3335b25 100644 --- a/allennlp/nn/util.py +++ b/allennlp/nn/util.py @@ -172,7 +172,12 @@ def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor): # sequence lengths and returning the now sorted indices. _, reverse_mapping = permutation_index.sort(0, descending=False) restoration_indices = index_range.index_select(0, reverse_mapping) - return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index + return ( + sorted_tensor, + sorted_sequence_lengths, + restoration_indices, + permutation_index, + ) def get_final_encoder_states( @@ -784,7 +789,9 @@ def sequence_cross_entropy_with_logits( # shape : (2,) alpha_factor = torch.tensor( - [1.0 - float(alpha), float(alpha)], dtype=weights.dtype, device=weights.device + [1.0 - float(alpha), float(alpha)], + dtype=weights.dtype, + device=weights.device, ) elif isinstance(alpha, (list, numpy.ndarray, torch.Tensor)): @@ -1266,7 +1273,10 @@ def batched_index_select( def masked_index_fill( - target: torch.Tensor, indices: torch.LongTensor, mask: torch.BoolTensor, fill_value: int = 1 + target: torch.Tensor, + indices: torch.LongTensor, + mask: torch.BoolTensor, + fill_value: int = 1, ) -> torch.Tensor: """ The given `indices` in `target` will be will be filled with `fill_value` given a `mask`. @@ -1508,7 +1518,10 @@ def bucket_values( def add_sentence_boundary_token_ids( - tensor: torch.Tensor, mask: torch.BoolTensor, sentence_begin_token: Any, sentence_end_token: Any + tensor: torch.Tensor, + mask: torch.BoolTensor, + sentence_begin_token: Any, + sentence_end_token: Any, ) -> Tuple[torch.Tensor, torch.BoolTensor]: """ Add begin/end of sentence tokens to the batch of sentences. @@ -1742,7 +1755,9 @@ def find_text_field_embedder(model: torch.nn.Module) -> torch.nn.Module: first one, as it's very rare to have more than one. If there isn't a `TextFieldEmbedder` in the given `Model`, we raise a `ValueError`. 
""" - from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder + from allennlp.modules.text_field_embedders.text_field_embedder import ( + TextFieldEmbedder, + ) for module in model.modules(): if isinstance(module, TextFieldEmbedder): @@ -1764,7 +1779,9 @@ def find_embedding_layer(model: torch.nn.Module) -> torch.nn.Module: from transformers.models.bert.modeling_bert import BertEmbeddings from transformers.models.albert.modeling_albert import AlbertEmbeddings from transformers.models.roberta.modeling_roberta import RobertaEmbeddings - from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder + from allennlp.modules.text_field_embedders.text_field_embedder import ( + TextFieldEmbedder, + ) from allennlp.modules.text_field_embedders.basic_text_field_embedder import ( BasicTextFieldEmbedder, ) diff --git a/allennlp/predictors/sentence_tagger.py b/allennlp/predictors/sentence_tagger.py index 04d19e92916..e795e8147a8 100644 --- a/allennlp/predictors/sentence_tagger.py +++ b/allennlp/predictors/sentence_tagger.py @@ -23,7 +23,10 @@ class SentenceTaggerPredictor(Predictor): """ def __init__( - self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm" + self, + model: Model, + dataset_reader: DatasetReader, + language: str = "en_core_web_sm", ) -> None: super().__init__(model, dataset_reader) self._tokenizer = SpacyTokenizer(language=language, pos_tags=True) diff --git a/allennlp/tools/archive_surgery.py b/allennlp/tools/archive_surgery.py index fc1014d23fc..4cad447b0ae 100644 --- a/allennlp/tools/archive_surgery.py +++ b/allennlp/tools/archive_surgery.py @@ -46,7 +46,10 @@ def main(): help="overwrite the input file with the modified configuration", ) parser.add_argument( - "-f", "--force", action="store_true", help="overwrite the output file if it exists" + "-f", + "--force", + action="store_true", + help="overwrite the output file if it exists", ) args = parser.parse_args() diff --git a/allennlp/tools/create_elmo_embeddings_from_vocab.py b/allennlp/tools/create_elmo_embeddings_from_vocab.py index a0c3c6f4c77..47e3c372da7 100644 --- a/allennlp/tools/create_elmo_embeddings_from_vocab.py +++ b/allennlp/tools/create_elmo_embeddings_from_vocab.py @@ -108,13 +108,19 @@ def main( help="A path to a vocabulary file to generate representations for.", ) parser.add_argument( - "--elmo_config", type=str, help="The path to a directory containing an ELMo config file." + "--elmo_config", + type=str, + help="The path to a directory containing an ELMo config file.", ) parser.add_argument( - "--elmo_weights", type=str, help="The path to a directory containing an ELMo weight file." + "--elmo_weights", + type=str, + help="The path to a directory containing an ELMo weight file.", ) parser.add_argument( - "--output_dir", type=str, help="The output directory to store the serialised embeddings." 
+ "--output_dir", + type=str, + help="The output directory to store the serialised embeddings.", ) parser.add_argument("--batch_size", type=int, default=64, help="The batch size to use.") parser.add_argument("--device", type=int, default=-1, help="The device to run on.") diff --git a/allennlp/training/checkpointer.py b/allennlp/training/checkpointer.py index 1cf9e8b8cf2..f98eff9ee78 100644 --- a/allennlp/training/checkpointer.py +++ b/allennlp/training/checkpointer.py @@ -63,7 +63,10 @@ def __init__( self._last_save_time = time.time() def maybe_save_checkpoint( - self, trainer: "allennlp.training.trainer.Trainer", epoch: int, batches_this_epoch: int + self, + trainer: "allennlp.training.trainer.Trainer", + epoch: int, + batches_this_epoch: int, ) -> None: """ Given amount of time lapsed between the last save and now (tracked internally), the @@ -76,6 +79,8 @@ def maybe_save_checkpoint( only looks at time, not batch or epoch number, though those parameters are available to you if you want to customize the behavior of this function. """ + if not trainer._primary: + return if self._model_save_interval is None: return if time.time() - self._last_save_time < self._model_save_interval: @@ -92,6 +97,9 @@ def save_checkpoint( is_best_so_far: bool = False, save_model_only=False, ) -> None: + if not trainer._primary: + return + if self._serialization_dir is not None: with trainer.get_checkpoint_state() as state: model_state, training_states = state diff --git a/allennlp/training/deepspeed/__init__.py b/allennlp/training/deepspeed/__init__.py new file mode 100644 index 00000000000..14fcb03b22e --- /dev/null +++ b/allennlp/training/deepspeed/__init__.py @@ -0,0 +1,3 @@ +from allennlp.training.deepspeed.trainer import DeepspeedTrainer +from allennlp.training.deepspeed import optimizers +from allennlp.training.deepspeed import modules diff --git a/allennlp/training/deepspeed/checkpointer.py b/allennlp/training/deepspeed/checkpointer.py new file mode 100644 index 00000000000..18adcd12007 --- /dev/null +++ b/allennlp/training/deepspeed/checkpointer.py @@ -0,0 +1,52 @@ +from typing import Union, Dict, Any, Tuple, Optional, Iterable + +import logging +import os +import shutil +from overrides import overrides + +from pathlib import Path + +import allennlp +from allennlp.training import Checkpointer + +logger = logging.getLogger(__name__) + + +class DeepspeedCheckpointer(Checkpointer): + @overrides + def save_checkpoint( + self, + epoch: Union[int, str], + trainer: "allennlp.training.deepspeed.trainer.DeepspeedTrainer", + is_best_so_far: bool = False, + save_model_only: bool = False, + ) -> None: + if self._serialization_dir is None: + return + + super().save_checkpoint(epoch, trainer, is_best_so_far, save_model_only) + + checkpoint_id = "deepspeed_epoch_{}".format(epoch) + trainer.model_engine.save_checkpoint(self._serialization_dir, checkpoint_id) + if trainer._primary and is_best_so_far: + engine_dir = os.path.join(self._serialization_dir, "best_deepspeed") + shutil.rmtree(engine_dir, ignore_errors=True) # in case no previous checkpoints + shutil.copytree(os.path.join(self._serialization_dir, checkpoint_id), engine_dir) + + def find_latest_deepspeed_checkpoint(self) -> Optional[str]: + checkpoints: Iterable[Path] = ( + self._serialization_dir and Path(self._serialization_dir).glob("deepspeed_epoch_*") + ) or [] + checkpoints = sorted(c for c in checkpoints if c.is_dir()) + if not checkpoints: + return None + + engine_path = str(checkpoints[-1]) + return engine_path + + @overrides + def 
restore_checkpoint(self) -> Tuple[str, Dict[str, Any], Dict[str, Any]]: + model_state, training_state = super().restore_checkpoint() + checkpoint_id = self.find_latest_deepspeed_checkpoint() + return checkpoint_id, model_state, training_state diff --git a/allennlp/training/deepspeed/config.py b/allennlp/training/deepspeed/config.py new file mode 100644 index 00000000000..f067febc473 --- /dev/null +++ b/allennlp/training/deepspeed/config.py @@ -0,0 +1,73 @@ +from typing import Dict, Any, Optional +from enum import IntEnum +from allennlp.common import FromParams +from dataclasses import dataclass, asdict + + +@dataclass +class DeepspeedFP16Config(FromParams): + enabled: bool = True + loss_scale: float = 0.0 + initial_scale_power: int = 32 + loss_scale_window: int = 1000 + hysteresis: int = 2 + min_loss_scale: float = 1.0 + + +@dataclass +class DeepspeedAMPConfig(FromParams): + enabled: bool = False + opt_level: str = "O1" + + +@dataclass +class DeepspeedOptimizerConfig(FromParams): + type: str + params: Dict[str, Any] + + +@dataclass +class DeepspeedLRSchedulerConfig(FromParams): + type: str + params: Dict[str, Any] + + +class DeepspeedZeROStage(IntEnum): + DISABLED = 0 + OPTIMIZER = 1 + GRADIENT = 2 + + +@dataclass +class DeepspeedZeROConfig(FromParams): + stage: DeepspeedZeROStage = DeepspeedZeROStage.GRADIENT + allgather_partitions: bool = True + allgather_bucket_size: int = 500000000 + overlap_comm: bool = False + reduce_scatter: bool = True + reduce_bucket_size: int = 500000000 + contiguous_gradients: bool = False + cpu_offload: bool = False + + +@dataclass +class DeepspeedConfig(FromParams): + zero_optimization: DeepspeedZeROConfig + fp16: DeepspeedFP16Config + amp: DeepspeedAMPConfig = DeepspeedAMPConfig() + optimizer: Optional[DeepspeedOptimizerConfig] = None + scheduler: Optional[DeepspeedLRSchedulerConfig] = None + + zero_allow_untested_optimizer: bool = True + wall_clock_breakdown: bool = False + + def to_dict(self): + return asdict(self) + + +@dataclass +class DeepspeedArgs(FromParams): + local_rank: int + deepspeed: bool = True + deepspeed_mpi: bool = False + deepspeed_config: Optional[str] = None diff --git a/allennlp/training/deepspeed/modules/__init__.py b/allennlp/training/deepspeed/modules/__init__.py new file mode 100644 index 00000000000..ccb4f78dac4 --- /dev/null +++ b/allennlp/training/deepspeed/modules/__init__.py @@ -0,0 +1 @@ +from allennlp.training.deepspeed.modules.sparse_transformer import SparseTransformerEmbedder diff --git a/allennlp/training/deepspeed/modules/sparse_attention.py b/allennlp/training/deepspeed/modules/sparse_attention.py new file mode 100644 index 00000000000..184892fafa4 --- /dev/null +++ b/allennlp/training/deepspeed/modules/sparse_attention.py @@ -0,0 +1,77 @@ +from typing import Optional, Union +from overrides import overrides +from copy import deepcopy + +from allennlp.common import Registrable + +from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.bert.modeling_bert import BertLayer + +from transformers.models.roberta.configuration_roberta import RobertaConfig +from transformers.models.roberta.modeling_roberta import RobertaLayer + +from deepspeed.ops.sparse_attention import ( + BertSparseSelfAttention, + SparsityConfig, + DenseSparsityConfig, + FixedSparsityConfig, + VariableSparsityConfig, + BigBirdSparsityConfig, + BSLongformerSparsityConfig, +) + +import torch +import warnings + + +class SparseSelfAttentionLayer(BertSparseSelfAttention): + @overrides + def forward( + self, hidden_states: 
torch.Tensor, attention_mask: Optional[torch.Tensor], *args, **kwargs + ): + extras = (*args, *kwargs.values()) + if not all(arg is None for arg in extras): + warnings.warn("SparseSelfAttentionLayer only accepts hidden_states and attention_mask.") + + return (super().forward(hidden_states, attention_mask),) + + +def replace_self_attention( + model: torch.nn.Module, + sparsity_config: SparsityConfig, + model_config: Union[BertConfig, RobertaConfig] = None, +): + # Largely follows these: + # https://github.com/microsoft/DeepSpeed/blob/c5b3f40e8481748f9658a19c2df1f17c5b579919/deepspeed/module_inject/inject.py#L6 + # https://github.com/microsoft/DeepSpeed/blob/c5b3f40e8481748f9658a19c2df1f17c5b579919/deepspeed/ops/sparse_attention/sparse_attention_utils.py#L85 + + config = model_config or model.config + assert isinstance( + config, (BertConfig, RobertaConfig) + ), "Only BERT and RoBERTa are currently supported by Deepspeed." + + for name, layer in model.named_children(): + if isinstance(layer, (BertLayer, RobertaLayer)): + deepspeed_sparse_self_attn = SparseSelfAttentionLayer(config, sparsity_config) + deepspeed_sparse_self_attn.query = layer.attention.self.query + deepspeed_sparse_self_attn.key = layer.attention.self.key + deepspeed_sparse_self_attn.value = layer.attention.self.value + + layer.attention.self = deepspeed_sparse_self_attn + setattr(model, name, deepcopy(layer)) + else: + replace_self_attention(layer, sparsity_config, model_config=config) + + return model + + +class _SparsityConfig(Registrable, SparsityConfig): + default_implementation = "base" + + +_SparsityConfig.register("base")(SparsityConfig) +_SparsityConfig.register("dense")(DenseSparsityConfig) +_SparsityConfig.register("fixed")(FixedSparsityConfig) +_SparsityConfig.register("variable")(VariableSparsityConfig) +_SparsityConfig.register("bigbird")(BigBirdSparsityConfig) +_SparsityConfig.register("longformer")(BSLongformerSparsityConfig) diff --git a/allennlp/training/deepspeed/modules/sparse_transformer.py b/allennlp/training/deepspeed/modules/sparse_transformer.py new file mode 100644 index 00000000000..4ed57ea3b47 --- /dev/null +++ b/allennlp/training/deepspeed/modules/sparse_transformer.py @@ -0,0 +1,52 @@ +from typing import Optional +from overrides import overrides + +from allennlp.modules.token_embedders.token_embedder import TokenEmbedder +from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder +from deepspeed.ops.sparse_attention import SparseAttentionUtils + +from .sparse_attention import _SparsityConfig, replace_self_attention + +import torch + + +@TokenEmbedder.register("sparse_transformer") +class SparseTransformerEmbedder(PretrainedTransformerEmbedder): + def __init__( + self, + model_name: str, + sparsity_config: _SparsityConfig = _SparsityConfig(num_heads=4), + **kwargs + ): + super().__init__(model_name, **kwargs) + + self._sparsity_config = sparsity_config + self.transformer_model = replace_self_attention( + self.transformer_model, self._sparsity_config + ) + + @overrides + def forward( + self, + token_ids: torch.LongTensor, + mask: torch.BoolTensor, + type_ids: Optional[torch.LongTensor] = None, + segment_concat_mask: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: # type: ignore + + _, token_ids, mask, type_ids, *_ = SparseAttentionUtils.pad_to_block_size( + block_size=self._sparsity_config.block, + input_ids=token_ids, + attention_mask=mask, + token_type_ids=type_ids, + position_ids=None, + inputs_embeds=None, + 
pad_token_id=self.transformer_model.config.pad_token_id, + model_mbeddings=None, # typo is in function definition, not here + ) + return super().forward( + token_ids=token_ids, + mask=mask, + type_ids=type_ids, + segment_concat_mask=segment_concat_mask, + ) diff --git a/allennlp/training/deepspeed/optimizers.py b/allennlp/training/deepspeed/optimizers.py new file mode 100644 index 00000000000..db13427235f --- /dev/null +++ b/allennlp/training/deepspeed/optimizers.py @@ -0,0 +1,35 @@ +from typing import List, Tuple, Dict, Any + +import torch + +from deepspeed.ops.lamb import FusedLamb +from allennlp.training.optimizers import Optimizer, make_parameter_groups + + +@Optimizer.register("fused_lamb") +class FusedLambOptimizer(Optimizer, FusedLamb): + def __init__( + self, + model_parameters: List[Tuple[str, torch.nn.Parameter]], + parameter_groups: List[Tuple[List[str], Dict[str, Any]]] = None, + lr: float = 0.001, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-08, + eps_inside_sqrt: bool = False, + weight_decay: float = 0.0, + amsgrad: bool = False, + max_grad_norm: float = 0.0, + max_coeff: float = 10.0, + min_coeff: float = 0.01, + ): + super().__init__( + params=make_parameter_groups(model_parameters, parameter_groups), + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad, + max_grad_norm=max_grad_norm, + max_coeff=max_coeff, + min_coeff=min_coeff, + ) diff --git a/allennlp/training/deepspeed/trainer.py b/allennlp/training/deepspeed/trainer.py new file mode 100644 index 00000000000..601d057432a --- /dev/null +++ b/allennlp/training/deepspeed/trainer.py @@ -0,0 +1,415 @@ +import logging +import re +import time +from typing import Any, Dict, List, Optional, Union +from overrides import overrides + +import torch +import torch.distributed as dist + +from deepspeed.runtime.engine import DeepSpeedEngine +from deepspeed.utils import logger as ds_logger + +from allennlp.common import Lazy, Tqdm +from allennlp.common import util as common_util +from allennlp.common.checks import ConfigurationError +from allennlp.data import DataLoader, TensorDict +from allennlp.models.model import Model +from allennlp.nn import util as nn_util +from allennlp.training import util as training_util +from allennlp.training.checkpointer import Checkpointer +from allennlp.training.moving_average import MovingAverage +from allennlp.training.optimizers import Optimizer + +from allennlp.training.trainer import ( + Trainer, + GradientDescentTrainer, + TrainerCallback, +) + +from allennlp.training.deepspeed.config import DeepspeedConfig, DeepspeedArgs +from allennlp.training.deepspeed.checkpointer import DeepspeedCheckpointer + +logger = logging.getLogger(__name__) +ds_logger.setLevel(logging.WARNING) +ds_logger.propagate = False + + +@Trainer.register("deepspeed", constructor="from_partial_objects") +class DeepspeedTrainer(GradientDescentTrainer): + def __init__( + self, + model: Model, + data_loader: DataLoader, + deepspeed_engine: DeepSpeedEngine, + patience: Optional[int] = None, + validation_metric: str = "-loss", + validation_data_loader: DataLoader = None, + num_epochs: int = 20, + serialization_dir: Optional[str] = None, + checkpointer: Checkpointer = None, + cuda_device: Optional[Union[int, torch.device]] = None, + moving_average: Optional[MovingAverage] = None, + callbacks: List[TrainerCallback] = None, + distributed: bool = False, + local_rank: int = 0, + world_size: int = 1, + num_gradient_accumulation_steps: int = 1, + ) -> None: + super().__init__( + model=model, + 
optimizer=deepspeed_engine.optimizer, + data_loader=data_loader, + patience=patience, + validation_metric=validation_metric, + validation_data_loader=validation_data_loader, + num_epochs=num_epochs, + serialization_dir=serialization_dir, + cuda_device=cuda_device, + checkpointer=checkpointer, + moving_average=moving_average, + callbacks=callbacks, + distributed=False, # Avoid DDP init + local_rank=local_rank, + world_size=world_size, + num_gradient_accumulation_steps=num_gradient_accumulation_steps, + use_amp=False, + ) + + self.model_engine = deepspeed_engine + self._distributed = True + + if checkpointer is None and serialization_dir is not None: + self._checkpointer = DeepspeedCheckpointer(serialization_dir) + + def batch_outputs(self, batch: TensorDict, for_training: bool) -> Dict[str, torch.Tensor]: + """ + Does a forward pass on the given batch and returns the output dictionary that the model + returns, after adding any specified regularization penalty to the loss (if training). + """ + batch = nn_util.move_to_device(batch, self.model_engine.device) + output_dict = self.model_engine(**batch) + + if for_training: + try: + assert "loss" in output_dict + regularization_penalty = self.model.get_regularization_penalty() + + if regularization_penalty is not None: + output_dict["reg_loss"] = regularization_penalty + output_dict["loss"] += regularization_penalty + + except AssertionError: + if for_training: + raise RuntimeError( + "The model you are trying to optimize does not contain a" + " 'loss' key in the output of model.forward(inputs)." + ) + + return output_dict + + def _train_epoch(self, epoch: int) -> Dict[str, float]: + """ + Trains one epoch and returns metrics. + """ + logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) + cpu_memory_usage = [] + for worker, memory in common_util.peak_cpu_memory().items(): + cpu_memory_usage.append((worker, memory)) + logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}") + gpu_memory_usage = [] + for gpu, memory in common_util.peak_gpu_memory().items(): + gpu_memory_usage.append((gpu, memory)) + logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") + + regularization_penalty = self.model.get_regularization_penalty() + + train_loss = 0.0 + batch_loss = 0.0 + train_reg_loss = None if regularization_penalty is None else 0.0 + batch_reg_loss = None if regularization_penalty is None else 0.0 + + # Set the model to "train" mode. + self.model_engine.train() + + # Get tqdm for the training batches + batch_generator = iter(self.data_loader) + + logger.info("Training") + + num_training_batches: Union[int, float] + len_data_loader = len(self.data_loader) + num_training_batches = len_data_loader + + # Having multiple tqdm bars in case of distributed training will be a mess. 
Hence only the master's + # progress is shown + if self._primary: + batch_generator_tqdm = Tqdm.tqdm(batch_generator, total=num_training_batches) + else: + batch_generator_tqdm = batch_generator + + self._last_log = time.time() + + batches_this_epoch = 0 + if self._batch_num_total is None: + self._batch_num_total = 0 + + for batch in batch_generator_tqdm: + batches_this_epoch += 1 + self._batch_num_total += 1 + batch_num_total = self._batch_num_total + + batch_outputs = self.batch_outputs(batch, for_training=True) + + loss = batch_outputs.get("loss") + reg_loss = batch_outputs.get("reg_loss") + if torch.isnan(loss): + raise ValueError("nan loss encountered") + + batch_loss = 0 if loss is None else loss.item() + train_loss += batch_loss + if reg_loss is not None: + batch_reg_loss = reg_loss.item() + train_reg_loss += batch_reg_loss # type: ignore + + self.model_engine.backward(loss) + self.model_engine.step() + + # Update moving averages + if self._moving_average is not None: + self._moving_average.apply(batch_num_total) + + # Update the description with the latest metrics + metrics = training_util.get_metrics( + self.model, + train_loss, + train_reg_loss, + batch_loss, + batch_reg_loss, + batches_this_epoch, + world_size=self._world_size, + cuda_device=self.cuda_device, + ) + + if self._primary: + # Updating tqdm only for the master as the trainers wouldn't have one + description = training_util.description_from_metrics(metrics) + batch_generator_tqdm.set_description(description, refresh=False) + + if self._checkpointer is not None: + self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) + + for callback in self._callbacks: + callback.on_batch( + self, + batch, + batch_outputs, + metrics, + epoch, + batches_this_epoch, + is_training=True, + is_primary=self._primary, + batch_grad_norm=None, # not yet implemented for DeepspeedTrainer + ) + + if self._distributed: + dist.barrier() + + metrics = training_util.get_metrics( + self.model, + train_loss, + train_reg_loss, + batch_loss=None, + batch_reg_loss=None, + num_batches=batches_this_epoch, + reset=True, + world_size=self._world_size, + cuda_device=self.cuda_device, + ) + + for (worker, memory) in cpu_memory_usage: + metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024) + for (gpu_num, memory) in gpu_memory_usage: + metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024) + return metrics + + def _restore_checkpoint(self) -> int: + """ + Restores the model and training state from the last saved checkpoint. + This includes an epoch count and optimizer state, which is serialized separately + from model parameters. This function should only be used to continue training - + if you wish to load a model for inference/load parts of a model into a new + computation graph, you should use the native Pytorch functions: + ` model.load_state_dict(torch.load("/path/to/model/weights.th"))` + If `self._serialization_dir` does not exist or does not contain any checkpointed weights, + this function will do nothing and return 0. + # Returns + epoch: `int` + The epoch at which to resume training, which should be one after the epoch + in the saved training state. 
+ """ + if self._checkpointer is None: + return 0 + + self._checkpointer: DeepspeedCheckpointer + ( + checkpoint_id, + model_state, + training_state, + ) = self._checkpointer.restore_checkpoint() + + if not training_state: + # No checkpoint to restore, start at 0 + return 0 + + self.model.load_state_dict(model_state) + self.model_engine.load_checkpoint(self._serialization_dir, checkpoint_id) + + # Currently the `training_state` contains a serialized `MetricTracker`. + if "metric_tracker" in training_state: + self._metric_tracker.load_state_dict(training_state["metric_tracker"]) + # It used to be the case that we tracked `val_metric_per_epoch`. + elif "val_metric_per_epoch" in training_state: + self._metric_tracker.clear() + self._metric_tracker.add_metrics(training_state["val_metric_per_epoch"]) + # And before that we didn't track anything. + else: + self._metric_tracker.clear() + + if isinstance(training_state["epoch"], int): + epoch_to_return = training_state["epoch"] + 1 + else: + epoch_to_return = int(training_state["epoch"].split(".")[0]) + 1 + + # For older checkpoints with batch_num_total missing, default to old behavior where + # it is unchanged. + batch_num_total = training_state.get("batch_num_total") + if batch_num_total is not None: + self._batch_num_total = batch_num_total + + return epoch_to_return + + @classmethod + @overrides + def from_partial_objects( + cls, + model: Model, + serialization_dir: str, + data_loader: DataLoader, + deepspeed_config: DeepspeedConfig, + validation_data_loader: DataLoader = None, + local_rank: int = 0, + patience: int = None, + validation_metric: str = "-loss", + num_epochs: int = 20, + cuda_device: Optional[Union[int, torch.device]] = None, + distributed: bool = None, + world_size: int = 1, + num_gradient_accumulation_steps: int = 1, + no_grad: List[str] = None, + optimizer: Lazy[Optimizer] = Lazy(Optimizer.default), + deepspeed_optimizer: Dict[str, Any] = None, + deepspeed_args: Lazy[DeepspeedArgs] = Lazy(DeepspeedArgs), + moving_average: Lazy[MovingAverage] = None, + checkpointer: Lazy[Checkpointer] = Lazy(DeepspeedCheckpointer), + callbacks: List[Lazy[TrainerCallback]] = None, + trainer_callbacks: List[Lazy[TrainerCallback]] = None, + ) -> "DeepspeedTrainer": + if no_grad: + for name, parameter in model.named_parameters(): + if any(re.search(regex, name) for regex in no_grad): + parameter.requires_grad_(False) + + common_util.log_frozen_and_tunable_parameter_names(model) + + parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] + moving_average_ = ( + None if moving_average is None else moving_average.construct(parameters=parameters) + ) + + checkpointer_ = checkpointer.construct(serialization_dir=serialization_dir) + + if deepspeed_config.optimizer: + optim_ = None + else: + optim_ = optimizer.construct(model_parameters=parameters) + + deepspeed_args_ = deepspeed_args.construct(local_rank=local_rank) or DeepspeedArgs( + local_rank=local_rank + ) + + if not hasattr(data_loader, "batch_size"): + raise ConfigurationError( + "Please specify your batch size in Deepspeed config if not using AllennlpDataLoader." 
+ ) + + model_engine = DeepspeedTrainer._build_engine( + model, + optim_, + deepspeed_config, + deepspeed_args_, + data_loader.batch_size, # type: ignore + num_gradient_accumulation_steps, + ) + + callbacks = callbacks or trainer_callbacks or [] + + callbacks_: List[TrainerCallback] = [] + + for callback in callbacks: + callback_ = callback.construct(serialization_dir=serialization_dir) + callbacks_.append(callback_) + + return cls( + model, + data_loader, + deepspeed_engine=model_engine, + patience=patience, + validation_metric=validation_metric, + validation_data_loader=validation_data_loader, + num_epochs=num_epochs, + serialization_dir=serialization_dir, + cuda_device=cuda_device, + checkpointer=checkpointer_, + moving_average=moving_average_, + callbacks=callbacks_, + distributed=False, + local_rank=local_rank, + world_size=world_size, + num_gradient_accumulation_steps=num_gradient_accumulation_steps, + ) + + @staticmethod + def _build_engine( + model: Model, + optimizer: torch.optim.Optimizer, + deepspeed_config: DeepspeedConfig, + args: DeepspeedArgs, + batch_size: int, + num_gradient_accumulation_steps: int, + ): + if not (optimizer is None or deepspeed_config.optimizer is None): + raise ConfigurationError( + f"Cannot provide both optimizer and deepspeed_optimizer. {optimizer, deepspeed_config.to_dict()}" + ) + + config: Dict[str, Any] = dict( + **{k: v for k, v in deepspeed_config.to_dict().items() if v is not None}, + train_batch_size=batch_size, + gradient_accumulation_steps=num_gradient_accumulation_steps, + ) + ds = DeepSpeedEngine( + args=args, + model=model, + optimizer=optimizer, + model_parameters=model.parameters(), + dist_init_required=False, + config_params=config, + ) + if hasattr(ds, "timers"): + + def mute_log(*args, **kwargs): + pass + + ds.timers.log = mute_log + return ds diff --git a/allennlp/training/learning_rate_schedulers/__init__.py b/allennlp/training/learning_rate_schedulers/__init__.py index 899bf5cb91f..d1301c321ce 100644 --- a/allennlp/training/learning_rate_schedulers/__init__.py +++ b/allennlp/training/learning_rate_schedulers/__init__.py @@ -22,9 +22,15 @@ ExponentialLearningRateScheduler, ReduceOnPlateauLearningRateScheduler, ) -from allennlp.training.learning_rate_schedulers.combined import CombinedLearningRateScheduler +from allennlp.training.learning_rate_schedulers.combined import ( + CombinedLearningRateScheduler, +) from allennlp.training.learning_rate_schedulers.cosine import CosineWithRestarts from allennlp.training.learning_rate_schedulers.noam import NoamLR -from allennlp.training.learning_rate_schedulers.slanted_triangular import SlantedTriangular +from allennlp.training.learning_rate_schedulers.slanted_triangular import ( + SlantedTriangular, +) from allennlp.training.learning_rate_schedulers.polynomial_decay import PolynomialDecay -from allennlp.training.learning_rate_schedulers.linear_with_warmup import LinearWithWarmup +from allennlp.training.learning_rate_schedulers.linear_with_warmup import ( + LinearWithWarmup, +) diff --git a/allennlp/training/learning_rate_schedulers/combined.py b/allennlp/training/learning_rate_schedulers/combined.py index c49e9a26cc8..78f333e064a 100644 --- a/allennlp/training/learning_rate_schedulers/combined.py +++ b/allennlp/training/learning_rate_schedulers/combined.py @@ -4,7 +4,9 @@ import torch from allennlp.common.lazy import Lazy -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler 
import ( + LearningRateScheduler, +) @LearningRateScheduler.register("combined") diff --git a/allennlp/training/learning_rate_schedulers/cosine.py b/allennlp/training/learning_rate_schedulers/cosine.py index d9311dde387..87815d48a1a 100644 --- a/allennlp/training/learning_rate_schedulers/cosine.py +++ b/allennlp/training/learning_rate_schedulers/cosine.py @@ -4,7 +4,9 @@ import numpy as np import torch -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) logger = logging.getLogger(__name__) diff --git a/allennlp/training/learning_rate_schedulers/learning_rate_scheduler.py b/allennlp/training/learning_rate_schedulers/learning_rate_scheduler.py index 84304a6c1ec..e5d3a727060 100644 --- a/allennlp/training/learning_rate_schedulers/learning_rate_scheduler.py +++ b/allennlp/training/learning_rate_schedulers/learning_rate_scheduler.py @@ -58,7 +58,11 @@ class StepLearningRateScheduler(_PyTorchLearningRateSchedulerWrapper): """ def __init__( - self, optimizer: Optimizer, step_size: int, gamma: float = 0.1, last_epoch: int = -1 + self, + optimizer: Optimizer, + step_size: int, + gamma: float = 0.1, + last_epoch: int = -1, ) -> None: lr_scheduler = torch.optim.lr_scheduler.StepLR( optimizer=optimizer, step_size=step_size, gamma=gamma, last_epoch=last_epoch @@ -74,10 +78,17 @@ class MultiStepLearningRateScheduler(_PyTorchLearningRateSchedulerWrapper): """ def __init__( - self, optimizer: Optimizer, milestones: List[int], gamma: float = 0.1, last_epoch: int = -1 + self, + optimizer: Optimizer, + milestones: List[int], + gamma: float = 0.1, + last_epoch: int = -1, ) -> None: lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( - optimizer=optimizer, milestones=milestones, gamma=gamma, last_epoch=last_epoch + optimizer=optimizer, + milestones=milestones, + gamma=gamma, + last_epoch=last_epoch, ) super().__init__(lr_scheduler) diff --git a/allennlp/training/learning_rate_schedulers/linear_with_warmup.py b/allennlp/training/learning_rate_schedulers/linear_with_warmup.py index 09b6c839f9e..98e901f0877 100644 --- a/allennlp/training/learning_rate_schedulers/linear_with_warmup.py +++ b/allennlp/training/learning_rate_schedulers/linear_with_warmup.py @@ -1,7 +1,9 @@ import torch from allennlp.training.learning_rate_schedulers import PolynomialDecay -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) @LearningRateScheduler.register("linear_with_warmup") diff --git a/allennlp/training/learning_rate_schedulers/noam.py b/allennlp/training/learning_rate_schedulers/noam.py index e04aacd46d3..9c17f862c33 100644 --- a/allennlp/training/learning_rate_schedulers/noam.py +++ b/allennlp/training/learning_rate_schedulers/noam.py @@ -1,7 +1,9 @@ from overrides import overrides import torch -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) @LearningRateScheduler.register("noam") diff --git a/allennlp/training/learning_rate_schedulers/polynomial_decay.py b/allennlp/training/learning_rate_schedulers/polynomial_decay.py index 93cb6112455..1a201b99c88 100644 --- a/allennlp/training/learning_rate_schedulers/polynomial_decay.py +++ 
b/allennlp/training/learning_rate_schedulers/polynomial_decay.py @@ -1,7 +1,9 @@ from overrides import overrides import torch -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) @LearningRateScheduler.register("polynomial_decay") diff --git a/allennlp/training/learning_rate_schedulers/slanted_triangular.py b/allennlp/training/learning_rate_schedulers/slanted_triangular.py index e9166b39864..3567564600c 100644 --- a/allennlp/training/learning_rate_schedulers/slanted_triangular.py +++ b/allennlp/training/learning_rate_schedulers/slanted_triangular.py @@ -4,7 +4,9 @@ from overrides import overrides import torch -from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import LearningRateScheduler +from allennlp.training.learning_rate_schedulers.learning_rate_scheduler import ( + LearningRateScheduler, +) logger = logging.getLogger(__name__) @@ -150,7 +152,10 @@ def get_values(self): if self.freezing_current: # if we are still freezing layers, we restrict the schedule to the current epoch num_steps = actual_num_steps_per_epoch - step = min(self.last_batch_num_total - self.batch_num_total_epoch_end[-1], num_steps) + step = min( + self.last_batch_num_total - self.batch_num_total_epoch_end[-1], + num_steps, + ) else: # otherwise we use the schedule for the rest of training if not self.gradual_unfreezing: diff --git a/allennlp/training/metrics/auc.py b/allennlp/training/metrics/auc.py index 154477a463c..363a44b3fee 100644 --- a/allennlp/training/metrics/auc.py +++ b/allennlp/training/metrics/auc.py @@ -78,10 +78,12 @@ def __call__( self._all_gold_labels = self._all_gold_labels.to(gold_labels.device) self._all_predictions = torch.cat( - [self._all_predictions, torch.masked_select(predictions, mask).float()], dim=0 + [self._all_predictions, torch.masked_select(predictions, mask).float()], + dim=0, ) self._all_gold_labels = torch.cat( - [self._all_gold_labels, torch.masked_select(gold_labels, mask).long()], dim=0 + [self._all_gold_labels, torch.masked_select(gold_labels, mask).long()], + dim=0, ) if is_distributed(): @@ -91,7 +93,8 @@ def __call__( # Check if batch lengths are equal. 
_all_batch_lengths = [torch.tensor(0) for i in range(world_size)] dist.all_gather( - _all_batch_lengths, torch.tensor(len(self._all_predictions), device=device) + _all_batch_lengths, + torch.tensor(len(self._all_predictions), device=device), ) _all_batch_lengths = [batch_length.item() for batch_length in _all_batch_lengths] diff --git a/allennlp/training/metrics/categorical_accuracy.py b/allennlp/training/metrics/categorical_accuracy.py index 559116a65ab..7ced491810c 100644 --- a/allennlp/training/metrics/categorical_accuracy.py +++ b/allennlp/training/metrics/categorical_accuracy.py @@ -85,7 +85,8 @@ def __call__( # ith entry in gold_labels points to index (0-num_classes) for ith row in max_predictions # For each row check if index pointed by gold_label is was 1 or not (among max scored classes) correct = max_predictions_mask[ - torch.arange(gold_labels.numel(), device=gold_labels.device).long(), gold_labels + torch.arange(gold_labels.numel(), device=gold_labels.device).long(), + gold_labels, ].float() tie_counts = max_predictions_mask.sum(-1) correct /= tie_counts.float() @@ -95,7 +96,7 @@ def __call__( correct *= mask.view(-1, 1) _total_count = mask.sum() else: - _total_count = torch.tensor(gold_labels.numel()) + _total_count = torch.tensor(gold_labels.numel(), device=gold_labels.device) _correct_count = correct.sum() if is_distributed(): diff --git a/allennlp/training/metrics/evalb_bracketing_scorer.py b/allennlp/training/metrics/evalb_bracketing_scorer.py index 074b154acae..55d59d1107c 100644 --- a/allennlp/training/metrics/evalb_bracketing_scorer.py +++ b/allennlp/training/metrics/evalb_bracketing_scorer.py @@ -19,7 +19,11 @@ DEFAULT_EVALB_DIR = os.path.abspath( os.path.join( - os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir, "tools", "EVALB" + os.path.dirname(os.path.realpath(__file__)), + os.pardir, + os.pardir, + "tools", + "EVALB", ) ) diff --git a/allennlp/training/metrics/fbeta_measure.py b/allennlp/training/metrics/fbeta_measure.py index bd8cce644aa..f927d36af76 100644 --- a/allennlp/training/metrics/fbeta_measure.py +++ b/allennlp/training/metrics/fbeta_measure.py @@ -233,7 +233,11 @@ def get_metric(self, reset: bool = False): "fscore": fscore.tolist(), } else: - return {"precision": precision.item(), "recall": recall.item(), "fscore": fscore.item()} + return { + "precision": precision.item(), + "recall": recall.item(), + "fscore": fscore.item(), + } @overrides def reset(self) -> None: diff --git a/allennlp/training/metrics/metric.py b/allennlp/training/metrics/metric.py index 8daab32e14d..b716606c51e 100644 --- a/allennlp/training/metrics/metric.py +++ b/allennlp/training/metrics/metric.py @@ -14,7 +14,10 @@ class Metric(Registrable): supports_distributed = False def __call__( - self, predictions: torch.Tensor, gold_labels: torch.Tensor, mask: Optional[torch.BoolTensor] + self, + predictions: torch.Tensor, + gold_labels: torch.Tensor, + mask: Optional[torch.BoolTensor], ): """ # Parameters diff --git a/allennlp/training/metrics/span_based_f1_measure.py b/allennlp/training/metrics/span_based_f1_measure.py index 17f82600c4b..8c2e41586b8 100644 --- a/allennlp/training/metrics/span_based_f1_measure.py +++ b/allennlp/training/metrics/span_based_f1_measure.py @@ -265,7 +265,9 @@ def get_metric(self, reset: bool = False): all_metrics = {} for tag in all_tags: precision, recall, f1_measure = self._compute_metrics( - self._true_positives[tag], self._false_positives[tag], self._false_negatives[tag] + self._true_positives[tag], + self._false_positives[tag], + 
self._false_negatives[tag], ) precision_key = "precision" + "-" + tag recall_key = "recall" + "-" + tag diff --git a/allennlp/training/metrics/spearman_correlation.py b/allennlp/training/metrics/spearman_correlation.py index cea2e70e724..b7fc3a5f4fd 100644 --- a/allennlp/training/metrics/spearman_correlation.py +++ b/allennlp/training/metrics/spearman_correlation.py @@ -66,7 +66,8 @@ def __call__( # Check if batch lengths are equal. _all_batch_lengths = [torch.tensor(0) for i in range(world_size)] dist.all_gather( - _all_batch_lengths, torch.tensor(self.total_predictions.shape[0], device=device) + _all_batch_lengths, + torch.tensor(self.total_predictions.shape[0], device=device), ) _all_batch_lengths = [batch_length.item() for batch_length in _all_batch_lengths] diff --git a/allennlp/training/moving_average.py b/allennlp/training/moving_average.py index 205eec973fa..2697b46a4f7 100644 --- a/allennlp/training/moving_average.py +++ b/allennlp/training/moving_average.py @@ -92,7 +92,8 @@ def apply(self, num_updates: Optional[int] = None) -> None: """ if num_updates is not None: decay = min( - self._decay, (self._numerator + num_updates) / (self._denominator + num_updates) + self._decay, + (self._numerator + num_updates) / (self._denominator + num_updates), ) else: decay = self._decay diff --git a/allennlp/training/scheduler.py b/allennlp/training/scheduler.py index 26b115b68ed..46791607690 100644 --- a/allennlp/training/scheduler.py +++ b/allennlp/training/scheduler.py @@ -27,7 +27,10 @@ class Scheduler: """ def __init__( - self, optimizer: torch.optim.Optimizer, param_group_field: str, last_epoch: int = -1 + self, + optimizer: torch.optim.Optimizer, + param_group_field: str, + last_epoch: int = -1, ) -> None: self.optimizer = optimizer self.param_group_field = param_group_field diff --git a/allennlp/training/tensorboard_writer.py b/allennlp/training/tensorboard_writer.py index 7f613afbf9b..4bc045c0d31 100644 --- a/allennlp/training/tensorboard_writer.py +++ b/allennlp/training/tensorboard_writer.py @@ -295,12 +295,25 @@ def log_metrics( # And maybe log to console if log_to_console and val_metric is not None and train_metric is not None: logger.info( - dual_message_template, name.ljust(name_length), train_metric, val_metric + dual_message_template, + name.ljust(name_length), + train_metric, + val_metric, ) elif log_to_console and val_metric is not None: - logger.info(no_train_message_template, name.ljust(name_length), "N/A", val_metric) + logger.info( + no_train_message_template, + name.ljust(name_length), + "N/A", + val_metric, + ) elif log_to_console and train_metric is not None: - logger.info(no_val_message_template, name.ljust(name_length), train_metric, "N/A") + logger.info( + no_val_message_template, + name.ljust(name_length), + train_metric, + "N/A", + ) def enable_activation_logging(self, model: Model) -> None: if self._histogram_interval is not None: diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index b3aa6fdb056..0d259c2a646 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -752,8 +752,8 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) - if self._checkpointer is not None: - self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) + if self._checkpointer is not None: + self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) for callback in 
self._callbacks: callback.on_batch( @@ -961,7 +961,7 @@ def _try_train(self) -> Tuple[Dict[str, Any], int]: epoch_start_time = time.time() train_metrics = self._train_epoch(epoch) - if self._primary and self._checkpointer is not None: + if self._checkpointer is not None: self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) # Wait for the primary process to finish saving the model checkpoint @@ -1039,7 +1039,7 @@ def _try_train(self) -> Tuple[Dict[str, Any], int]: if self._momentum_scheduler: self._momentum_scheduler.step(this_epoch_val_metric) - if self._primary and self._checkpointer is not None: + if self._checkpointer is not None: self._checkpointer.save_checkpoint( epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() ) diff --git a/allennlp/training/util.py b/allennlp/training/util.py index 09feb3481f3..cae03d8675a 100644 --- a/allennlp/training/util.py +++ b/allennlp/training/util.py @@ -113,7 +113,9 @@ def data_loaders_from_params( train_data_path = params.pop("train_data_path") logger.info("Reading training data from %s", train_data_path) data_loaders["train"] = DataLoader.from_params( - data_loader_params.duplicate(), reader=dataset_reader, data_path=train_data_path + data_loader_params.duplicate(), + reader=dataset_reader, + data_path=train_data_path, ) if not validation and not test: @@ -419,7 +421,9 @@ def description_from_metrics(metrics: Dict[str, float]) -> str: def make_vocab_from_params( - params: Params, serialization_dir: Union[str, PathLike], print_statistics: bool = False + params: Params, + serialization_dir: Union[str, PathLike], + print_statistics: bool = False, ) -> Vocabulary: vocab_params = params.pop("vocabulary", {}) os.makedirs(serialization_dir, exist_ok=True) @@ -435,7 +439,10 @@ def make_vocab_from_params( ) # Do a quick sanity check here. There's no need to load any datasets if the vocab # type is "empty" or "from_files". - if datasets_for_vocab_creation is None and vocab_params.get("type") in {"empty", "from_files"}: + if datasets_for_vocab_creation is None and vocab_params.get("type") in { + "empty", + "from_files", + }: datasets_for_vocab_creation = [] data_loaders: Dict[str, DataLoader] diff --git a/scripts/ai2_internal/resume_daemon.py b/scripts/ai2_internal/resume_daemon.py index 76b29d1796b..ced34f31255 100644 --- a/scripts/ai2_internal/resume_daemon.py +++ b/scripts/ai2_internal/resume_daemon.py @@ -145,7 +145,12 @@ def get_status(self, experiment_id: str) -> BeakerStatus: return status def resume(self, experiment_id: str) -> str: - command = ["beaker", "experiment", "resume", f"--experiment-name={experiment_id}"] + command = [ + "beaker", + "experiment", + "resume", + f"--experiment-name={experiment_id}", + ] # Small delay to avoid thrashing Beaker. 
time.sleep(BEAKER_QUERY_INTERVAL_SECONDS) return subprocess.check_output(command, universal_newlines=True).strip() @@ -206,7 +211,12 @@ def resume(connection: Connection, beaker: BeakerWrapper) -> None: ) cursor.execute( "INSERT INTO active_experiments VALUES (?, ?, ?, ?)", - (new_experiment_id, original_id, max_resumes, current_resume + 1), + ( + new_experiment_id, + original_id, + max_resumes, + current_resume + 1, + ), ) connection.commit() else: diff --git a/scripts/ai2_internal/run_with_beaker.py b/scripts/ai2_internal/run_with_beaker.py index 262dfde4f3f..b5ce82bf482 100755 --- a/scripts/ai2_internal/run_with_beaker.py +++ b/scripts/ai2_internal/run_with_beaker.py @@ -15,7 +15,8 @@ random_int = random.randint(0, 2 ** 32) sys.path.insert( - 0, os.path.dirname(os.path.abspath(os.path.join(os.path.join(__file__, os.pardir), os.pardir))) + 0, + os.path.dirname(os.path.abspath(os.path.join(os.path.join(__file__, os.pardir), os.pardir))), ) from allennlp.common.params import Params @@ -62,17 +63,25 @@ def main(param_file: str, args: argparse.Namespace): print("Create a Beaker image...") image = subprocess.check_output( - f"beaker image create --quiet {docker_image}", shell=True, universal_newlines=True + f"beaker image create --quiet {docker_image}", + shell=True, + universal_newlines=True, ).strip() print(f" Image created: {docker_image}") config_dataset_id = subprocess.check_output( - f"beaker dataset create --quiet {params_dir}/*", shell=True, universal_newlines=True + f"beaker dataset create --quiet {params_dir}/*", + shell=True, + universal_newlines=True, ).strip() # Arguments that differ between preemptible and regular machine execution. if args.preemptible: - allennlp_prefix = ["/stage/allennlp/resumable_train.sh", "/output", "/config/config.json"] + allennlp_prefix = [ + "/stage/allennlp/resumable_train.sh", + "/output", + "/config/config.json", + ] else: allennlp_prefix = [ "python", @@ -132,7 +141,14 @@ def main(param_file: str, args: argparse.Namespace): output.write(json.dumps(config, indent=4)) print(f"Beaker spec written to {output_path}.") - experiment_command = ["beaker", "experiment", "create", "--quiet", "--file", output_path] + experiment_command = [ + "beaker", + "experiment", + "create", + "--quiet", + "--file", + output_path, + ] if args.name: experiment_command.append("--name") experiment_command.append(args.name.replace(" ", "-")) @@ -175,10 +191,14 @@ def resume_command(experiment_id): parser.add_argument("param_file", type=str, help="The model configuration file.") parser.add_argument("--name", type=str, help="A name for the experiment.") parser.add_argument( - "--spec_output_path", type=str, help="The destination to write the experiment spec." + "--spec_output_path", + type=str, + help="The destination to write the experiment spec.", ) parser.add_argument( - "--dry-run", action="store_true", help="If specified, an experiment will not be created." 
+ "--dry-run", + action="store_true", + help="If specified, an experiment will not be created.", ) parser.add_argument( "--image", type=str, help="The image to use (if unspecified one will be built)" @@ -198,11 +218,15 @@ def resume_command(experiment_id): ) parser.add_argument("--cpu", help="CPUs to reserve for this experiment (e.g., 0.5)") parser.add_argument( - "--gpu-count", default=1, help="GPUs to use for this experiment (e.g., 1 (default))" + "--gpu-count", + default=1, + help="GPUs to use for this experiment (e.g., 1 (default))", ) parser.add_argument("--memory", help="Memory to reserve for this experiment (e.g., 1GB)") parser.add_argument( - "--preemptible", action="store_true", help="Allow task to run on preemptible hardware" + "--preemptible", + action="store_true", + help="Allow task to run on preemptible hardware", ) parser.add_argument( "--max-resumes", diff --git a/scripts/build_docs_config.py b/scripts/build_docs_config.py index 36f1105e385..67e4296535e 100644 --- a/scripts/build_docs_config.py +++ b/scripts/build_docs_config.py @@ -22,7 +22,8 @@ def parse_args(): parser.add_argument("source_yaml", help="Path to the mkdocs skeleton config file.") parser.add_argument("docs_root", help="The root of the markdown docs folder.") parser.add_argument( - "api_docs_path", help="The root of the API docs within the markdown docs root folder." + "api_docs_path", + help="The root of the API docs within the markdown docs root folder.", ) parser.add_argument("--docs-version", type=str, default=f"v{VERSION}") return parser.parse_args() diff --git a/scripts/close_stale_issues.py b/scripts/close_stale_issues.py index d84062099e6..7612ad75aa0 100644 --- a/scripts/close_stale_issues.py +++ b/scripts/close_stale_issues.py @@ -4,7 +4,12 @@ from github import Github -LABELS_TO_EXEMPT = ["contributions welcome", "merge when ready", "under development", "help wanted"] +LABELS_TO_EXEMPT = [ + "contributions welcome", + "merge when ready", + "under development", + "help wanted", +] def main(): diff --git a/scripts/py2md.py b/scripts/py2md.py index 798febaa6df..c6baf3ba6d1 100755 --- a/scripts/py2md.py +++ b/scripts/py2md.py @@ -333,7 +333,10 @@ def _format_function_signature( > 60 ): signature_args = ",\n ".join( - filter(lambda s: s.strip() not in ("", ","), (str(arg) for arg in func.args)) + filter( + lambda s: s.strip() not in ("", ","), + (str(arg) for arg in func.args), + ) ) parts.append("(\n " + signature_args + "\n)") else: diff --git a/scripts/train_fixtures.py b/scripts/train_fixtures.py index bbb5eb6246a..a7e0920b761 100755 --- a/scripts/train_fixtures.py +++ b/scripts/train_fixtures.py @@ -54,7 +54,8 @@ def train_fixture_gpu(config_prefix: str) -> None: # now copy back the weights and and archived model shutil.copy(os.path.join(tempdir, "best.th"), os.path.join(serialization_dir, "best_gpu.th")) shutil.copy( - os.path.join(tempdir, "model.tar.gz"), os.path.join(serialization_dir, "model_gpu.tar.gz") + os.path.join(tempdir, "model.tar.gz"), + os.path.join(serialization_dir, "model_gpu.tar.gz"), ) diff --git a/setup.py b/setup.py index 72a99236b89..8823174e662 100644 --- a/setup.py +++ b/setup.py @@ -73,6 +73,7 @@ "lmdb", "more-itertools", ], + extras_require={"deepspeed": ["deepspeed>=0.3.7,<=0.3.8"]}, entry_points={"console_scripts": ["allennlp=allennlp.__main__:run"]}, include_package_data=True, python_requires=">=3.6.1", diff --git a/tests/commands/cached_path_test.py b/tests/commands/cached_path_test.py index 1a293d64044..12e9dfb325c 100644 --- a/tests/commands/cached_path_test.py +++ 
b/tests/commands/cached_path_test.py @@ -8,13 +8,25 @@ class TestCachedPathCommand(AllenNlpTestCase): def test_local_file(self, capsys): - sys.argv = ["allennlp", "cached-path", "--cache-dir", str(self.TEST_DIR), "README.md"] + sys.argv = [ + "allennlp", + "cached-path", + "--cache-dir", + str(self.TEST_DIR), + "README.md", + ] main() captured = capsys.readouterr() assert "README.md" in captured.out def test_inspect_empty_cache(self, capsys): - sys.argv = ["allennlp", "cached-path", "--cache-dir", str(self.TEST_DIR), "--inspect"] + sys.argv = [ + "allennlp", + "cached-path", + "--cache-dir", + str(self.TEST_DIR), + "--inspect", + ] main() captured = capsys.readouterr() assert "Cached resources:" in captured.out diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index 8ad4e624df5..8c93159105b 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -127,7 +127,13 @@ def test_evaluate_works_with_vocab_expansion(self): embedding_sources_mapping = json.dumps( {"_text_field_embedder.token_embedder_tokens": embeddings_filename} ) - kebab_args = ["evaluate", archive_path, evaluate_data_path, "--cuda-device", "-1"] + kebab_args = [ + "evaluate", + archive_path, + evaluate_data_path, + "--cuda-device", + "-1", + ] # TODO(mattg): the unawarded_embeddings.gz file above doesn't exist, but this test still # passes. This suggests that vocab extension in evaluate isn't currently doing anything, diff --git a/tests/commands/find_learning_rate_test.py b/tests/commands/find_learning_rate_test.py index f33cdb1f924..d9b92ada286 100644 --- a/tests/commands/find_learning_rate_test.py +++ b/tests/commands/find_learning_rate_test.py @@ -36,7 +36,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"), @@ -111,7 +116,12 @@ def test_find_learning_rate_args(self): FindLearningRate().add_subparser(subparsers) for serialization_arg in ["-s", "--serialization-dir"]: - raw_args = ["find-lr", "path/to/params", serialization_arg, "serialization_dir"] + raw_args = [ + "find-lr", + "path/to/params", + serialization_arg, + "serialization_dir", + ] args = parser.parse_args(raw_args) @@ -160,7 +170,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"), diff --git a/tests/commands/main_test.py b/tests/commands/main_test.py index 5931407bf14..f7fb45f08c0 100644 --- a/tests/commands/main_test.py +++ b/tests/commands/main_test.py @@ -108,7 +108,13 @@ def test_other_modules(self): serialization_dir = self.TEST_DIR / "serialization" # Run train with using the non-allennlp module. - sys.argv = ["allennlp", "train", str(config_path), "-s", str(serialization_dir)] + sys.argv = [ + "allennlp", + "train", + str(config_path), + "-s", + str(serialization_dir), + ] # Shouldn't be able to find the model. 
with pytest.raises(ConfigurationError): diff --git a/tests/commands/predict_test.py b/tests/commands/predict_test.py index 644ce442d00..ec7898f3c26 100644 --- a/tests/commands/predict_test.py +++ b/tests/commands/predict_test.py @@ -85,7 +85,13 @@ def test_works_with_known_model(self): assert len(results) == 2 for result in results: - assert set(result.keys()) == {"label", "logits", "probs", "tokens", "token_ids"} + assert set(result.keys()) == { + "label", + "logits", + "probs", + "tokens", + "token_ids", + } shutil.rmtree(self.tempdir) @@ -111,7 +117,14 @@ def test_using_dataset_reader_works_with_known_model(self): assert len(results) == 3 for result in results: - assert set(result.keys()) == {"label", "logits", "loss", "probs", "tokens", "token_ids"} + assert set(result.keys()) == { + "label", + "logits", + "loss", + "probs", + "tokens", + "token_ids", + } shutil.rmtree(self.tempdir) @@ -284,7 +297,13 @@ def test_batch_prediction_works_with_known_model(self): assert len(results) == 2 for result in results: - assert set(result.keys()) == {"label", "logits", "probs", "tokens", "token_ids"} + assert set(result.keys()) == { + "label", + "logits", + "probs", + "tokens", + "token_ids", + } shutil.rmtree(self.tempdir) @@ -458,7 +477,13 @@ def test_other_modules(self): assert len(results) == 2 # Overridden predictor should output extra field for result in results: - assert set(result.keys()) == {"label", "logits", "probs", "tokens", "token_ids"} + assert set(result.keys()) == { + "label", + "logits", + "probs", + "tokens", + "token_ids", + } def test_alternative_file_formats(self): @Predictor.register("classification-csv") diff --git a/tests/commands/print_results_test.py b/tests/commands/print_results_test.py index 3628c66e4bd..f68198ef1f3 100644 --- a/tests/commands/print_results_test.py +++ b/tests/commands/print_results_test.py @@ -28,10 +28,12 @@ def setup_method(self): open(os.path.join(self.directory1 / "metrics.json"), "w+"), ) json.dump( - {"train": 4, "dev": 5}, open(os.path.join(self.directory2 / "metrics.json"), "w+") + {"train": 4, "dev": 5}, + open(os.path.join(self.directory2 / "metrics.json"), "w+"), ) json.dump( - {"train": 6, "dev": 7}, open(os.path.join(self.directory3 / "cool_metrics.json"), "w+") + {"train": 6, "dev": 7}, + open(os.path.join(self.directory3 / "cool_metrics.json"), "w+"), ) def test_print_results(self): diff --git a/tests/commands/train_test.py b/tests/commands/train_test.py index 31f416e900b..462f6fb93d0 100644 --- a/tests/commands/train_test.py +++ b/tests/commands/train_test.py @@ -13,10 +13,15 @@ import pytest import torch -from allennlp.commands.train import Train, train_model, train_model_from_args, TrainModel +from allennlp.commands.train import ( + Train, + train_model, + train_model_from_args, + TrainModel, +) from allennlp.common import Params from allennlp.common.checks import ConfigurationError -from allennlp.common.testing import AllenNlpTestCase, cpu_or_gpu +from allennlp.common.testing import AllenNlpTestCase, cpu_or_gpu, requires_multi_gpu from allennlp.data import Vocabulary from allennlp.data.data_loaders import TensorDict from allennlp.models import load_archive, Model @@ -82,7 +87,12 @@ class TestTrain(AllenNlpTestCase): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": 
"sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -115,7 +125,10 @@ def test_train_model(self): # It's also not OK if serialization dir is a real serialization dir: with pytest.raises(ConfigurationError): - train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, "test_train_model")) + train_model( + params(), + serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), + ) # But it's OK if serialization dir exists and --recover is specified: train_model( @@ -126,7 +139,9 @@ def test_train_model(self): # It's ok serialization dir exists and --force is specified (it will be deleted): train_model( - params(), serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), force=True + params(), + serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), + force=True, ) # But --force and --recover cannot both be specified @@ -167,7 +182,10 @@ def test_force_gpu(self): _seen_training_devices.clear() if torch.cuda.device_count() == 0: with pytest.raises(ConfigurationError): - train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_force_gpu")) + train_model( + params, + serialization_dir=os.path.join(self.TEST_DIR, "test_force_gpu"), + ) else: train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_force_gpu")) assert len(_seen_training_devices) == 1 @@ -203,7 +221,12 @@ def test_train_model_distributed(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -237,6 +260,69 @@ def test_train_model_distributed(self): # Check we can load the serialized model assert load_archive(out_dir).model + @requires_multi_gpu + def test_train_model_deepspeed(self): + if torch.cuda.device_count() >= 2: + devices = [0, 1] + else: + devices = [-1, -1] + + params = lambda: Params( + { + "model": { + "type": "simple_tagger", + "text_field_embedder": { + "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} + }, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, + }, + "dataset_reader": {"type": "sequence_tagging"}, + "train_data_path": SEQUENCE_TAGGING_DATA_PATH, + "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, + "data_loader": {"batch_size": 2}, + "trainer": { + "type": "deepspeed", + "deepspeed_config": { + "zero_optimization": {"stage": 2}, + "fp16": { + "enabled": True, + }, + }, + "num_epochs": 2, + "optimizer": "adam", + }, + "distributed": {"cuda_devices": devices}, + } + ) + + out_dir = os.path.join(self.TEST_DIR, "test_distributed_train") + train_model(params(), serialization_dir=out_dir) + + # Check that some logs specific to distributed + # training are where we expect. + serialized_files = os.listdir(out_dir) + assert "out_worker0.log" in serialized_files + assert "out_worker1.log" in serialized_files + assert "model.tar.gz" in serialized_files + assert "metrics.json" in serialized_files + + # Make sure the metrics look right. 
+ with open(os.path.join(out_dir, "metrics.json")) as f: + metrics = json.load(f) + assert metrics["peak_worker_0_memory_MB"] > 0 + assert metrics["peak_worker_1_memory_MB"] > 0 + if torch.cuda.device_count() >= 2: + assert metrics["peak_gpu_0_memory_MB"] > 0 + assert metrics["peak_gpu_1_memory_MB"] > 0 + + # Check we can load the serialized model + assert load_archive(out_dir).model + @cpu_or_gpu @pytest.mark.parametrize("max_instances_in_memory", [None, 10]) def test_train_model_distributed_with_sharded_reader(self, max_instances_in_memory): @@ -252,9 +338,17 @@ def test_train_model_distributed_with_sharded_reader(self, max_instances_in_memo "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, + }, + "dataset_reader": { + "type": "sharded", + "base_reader": {"type": "sequence_tagging"}, }, - "dataset_reader": {"type": "sharded", "base_reader": {"type": "sequence_tagging"}}, "train_data_path": SEQUENCE_TAGGING_SHARDS_PATH, "validation_data_path": SEQUENCE_TAGGING_SHARDS_PATH, "data_loader": { @@ -341,7 +435,12 @@ def test_train_model_distributed_without_sharded_reader(self, max_instances_in_m "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -423,7 +522,12 @@ def test_distributed_raises_error_with_no_gpus(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -434,7 +538,10 @@ def test_distributed_raises_error_with_no_gpus(self): } ) with pytest.raises(ConfigurationError): - train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_train_model")) + train_model( + params, + serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), + ) def test_train_saves_all_keys_in_config(self): params = Params( @@ -444,7 +551,12 @@ def test_train_saves_all_keys_in_config(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "pytorch_seed": 42, "numpy_seed": 42, @@ -476,7 +588,12 @@ def test_error_is_throw_when_cuda_device_is_not_available(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": "test_fixtures/data/sequence_tagging.tsv", @@ -491,7 +608,10 @@ def test_error_is_throw_when_cuda_device_is_not_available(self): ) with pytest.raises(ConfigurationError, match="Experiment 
specified"): - train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_train_model")) + train_model( + params, + serialization_dir=os.path.join(self.TEST_DIR, "test_train_model"), + ) def test_train_with_test_set(self): params = Params( @@ -501,7 +621,12 @@ def test_train_with_test_set(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -555,7 +680,12 @@ def on_batch( "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -572,7 +702,8 @@ def on_batch( } ) train_model( - params.duplicate(), serialization_dir=os.path.join(self.TEST_DIR, "train_normal") + params.duplicate(), + serialization_dir=os.path.join(self.TEST_DIR, "train_normal"), ) assert batch_callback_counter == last_num_steps_per_epoch * number_of_epochs batch_callback_counter = 0 @@ -581,7 +712,8 @@ def on_batch( original_batch_size = params["data_loader"]["batch_size"] params["data_loader"]["batch_size"] = 1 train_model( - params.duplicate(), serialization_dir=os.path.join(self.TEST_DIR, "train_with_bs1") + params.duplicate(), + serialization_dir=os.path.join(self.TEST_DIR, "train_with_bs1"), ) assert batch_callback_counter == last_num_steps_per_epoch * number_of_epochs batch_callback_counter = 0 @@ -600,7 +732,12 @@ def test_train_args(self): Train().add_subparser(subparsers) for serialization_arg in ["-s", "--serialization-dir"]: - raw_args = ["train", "path/to/params", serialization_arg, "serialization_dir"] + raw_args = [ + "train", + "path/to/params", + serialization_arg, + "serialization_dir", + ] args = parser.parse_args(raw_args) @@ -623,7 +760,10 @@ def test_train_model_can_instantiate_from_params(self): # Can instantiate from base class params TrainModel.from_params( - params=params, serialization_dir=self.TEST_DIR, local_rank=0, batch_weight_key="" + params=params, + serialization_dir=self.TEST_DIR, + local_rank=0, + batch_weight_key="", ) def test_train_can_fine_tune_model_from_archive(self): @@ -631,7 +771,10 @@ def test_train_can_fine_tune_model_from_archive(self): self.FIXTURES_ROOT / "basic_classifier" / "experiment_from_archive.jsonnet" ) train_loop = TrainModel.from_params( - params=params, serialization_dir=self.TEST_DIR, local_rank=0, batch_weight_key="" + params=params, + serialization_dir=self.TEST_DIR, + local_rank=0, + batch_weight_key="", ) train_loop.run() @@ -652,7 +795,12 @@ def test_train_nograd_regex(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, @@ -662,7 +810,11 @@ def test_train_nograd_regex(self): } ) serialization_dir = os.path.join(self.TEST_DIR, "test_train_nograd") - regex_lists = [[], 
[".*text_field_embedder.*"], [".*text_field_embedder.*", ".*encoder.*"]] + regex_lists = [ + [], + [".*text_field_embedder.*"], + [".*text_field_embedder.*", ".*encoder.*"], + ] for regex_list in regex_lists: params = params_get() params["trainer"]["no_grad"] = regex_list @@ -694,7 +846,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"), @@ -731,7 +888,16 @@ def test_dry_run_makes_vocab(self): tokens = [line.strip() for line in f] tokens.sort() - assert tokens == [".", "@@UNKNOWN@@", "animals", "are", "birds", "cats", "dogs", "snakes"] + assert tokens == [ + ".", + "@@UNKNOWN@@", + "animals", + "are", + "birds", + "cats", + "dogs", + "snakes", + ] with open(vocab_path / "labels.txt") as f: labels = [line.strip() for line in f] diff --git a/tests/common/file_utils_test.py b/tests/common/file_utils_test.py index 9e56781a196..6e2e500a2e8 100644 --- a/tests/common/file_utils_test.py +++ b/tests/common/file_utils_test.py @@ -121,13 +121,20 @@ def mocked_http_etag(url: str): url = "https://github.com/allenai/allennlp/blob/master/some-fake-resource" # We'll create two cached versions of this fake resource using two different etags. - etags = ['W/"3e5885bfcbf4c47bc4ee9e2f6e5ea916"', 'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea918"'] + etags = [ + 'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea916"', + 'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea918"', + ] filenames = [ os.path.join(self.TEST_DIR, _resource_to_filename(url, etag)) for etag in etags ] for filename, etag in zip(filenames, etags): meta = _Meta( - resource=url, cached_path=filename, creation_time=time.time(), etag=etag, size=2341 + resource=url, + cached_path=filename, + creation_time=time.time(), + etag=etag, + size=2341, ) meta.to_file() with open(filename, "w") as f: @@ -216,7 +223,10 @@ def test_resource_to_filename_with_etags_eliminates_quotes(self): def test_split_s3_path(self): # Test splitting good urls. - assert _split_s3_path("s3://my-bucket/subdir/file.txt") == ("my-bucket", "subdir/file.txt") + assert _split_s3_path("s3://my-bucket/subdir/file.txt") == ( + "my-bucket", + "subdir/file.txt", + ) assert _split_s3_path("s3://my-bucket/file.txt") == ("my-bucket", "file.txt") # Test splitting bad urls. 
@@ -406,7 +416,9 @@ def test_remove_entries(self): ) self.create_cache_entry("http://other.fake.datastore.com/glove.txt.gz", "etag-4") self.create_cache_entry( - "http://other.fake.datastore.com/glove.txt.gz", "etag-5", as_extraction_dir=True + "http://other.fake.datastore.com/glove.txt.gz", + "etag-5", + as_extraction_dir=True, ) reclaimed_space = remove_cache_entries(["http://fake.*"], cache_dir=self.TEST_DIR) @@ -430,11 +442,13 @@ def setup_method(self): super().setup_method() self.tar_file = self.TEST_DIR / "utf-8.tar.gz" shutil.copyfile( - self.FIXTURES_ROOT / "utf-8_sample" / "archives" / "utf-8.tar.gz", self.tar_file + self.FIXTURES_ROOT / "utf-8_sample" / "archives" / "utf-8.tar.gz", + self.tar_file, ) self.zip_file = self.TEST_DIR / "utf-8.zip" shutil.copyfile( - self.FIXTURES_ROOT / "utf-8_sample" / "archives" / "utf-8.zip", self.zip_file + self.FIXTURES_ROOT / "utf-8_sample" / "archives" / "utf-8.zip", + self.zip_file, ) def check_extracted(self, extracted: str): diff --git a/tests/common/from_params_test.py b/tests/common/from_params_test.py index dfd479ffe3d..cbb5a012fbd 100644 --- a/tests/common/from_params_test.py +++ b/tests/common/from_params_test.py @@ -4,7 +4,12 @@ import torch from allennlp.common import Lazy, Params, Registrable -from allennlp.common.from_params import FromParams, takes_arg, remove_optional, create_kwargs +from allennlp.common.from_params import ( + FromParams, + takes_arg, + remove_optional, + create_kwargs, +) from allennlp.common.testing import AllenNlpTestCase from allennlp.data import DataLoader, DatasetReader, Tokenizer from allennlp.models import Model @@ -344,7 +349,10 @@ def __init__(self, a: Union[float, int]) -> None: int_param_str = '{"a": 1}' import json - for expected_type, param_str in [(int, int_param_str), (float, float_param_str)]: + for expected_type, param_str in [ + (int, int_param_str), + (float, float_param_str), + ]: for cls in [IntFloat, FloatInt]: c = cls.from_params(Params(json.loads(param_str))) assert type(c.a) == expected_type @@ -382,7 +390,10 @@ def __init__(self, items: Dict[str, A]) -> None: params = Params( { "type": "d", - "items": {"first": {"type": "b", "size": 1}, "second": {"type": "b", "size": 2}}, + "items": { + "first": {"type": "b", "size": 1}, + "second": {"type": "b", "size": 2}, + }, } ) d = C.from_params(params) @@ -465,7 +476,10 @@ def __init__(self, items: Tuple[A, C]) -> None: self.items = items params = Params( - {"type": "f", "items": [{"type": "b", "size": 1}, {"type": "d", "name": "item2"}]} + { + "type": "f", + "items": [{"type": "b", "size": 1}, {"type": "d", "name": "item2"}], + } ) f = E.from_params(params) @@ -832,7 +846,10 @@ def __init__(self, items: Mapping[str, A]) -> None: params = Params( { "type": "d", - "items": {"first": {"type": "b", "size": 1}, "second": {"type": "b", "size": 2}}, + "items": { + "first": {"type": "b", "size": 1}, + "second": {"type": "b", "size": 2}, + }, } ) d = C.from_params(params) @@ -1018,7 +1035,8 @@ def __init__(self, a: int, b: str = None, **kwargs) -> None: assert foo.c is None foo = Bar.from_params( - params=Params({"type": "foo", "a": 2, "b": "hi", "c": {"2": "3"}}), extra="4" + params=Params({"type": "foo", "a": 2, "b": "hi", "c": {"2": "3"}}), + extra="4", ) assert foo.a == 2 assert foo.b == "hi" diff --git a/tests/common/params_test.py b/tests/common/params_test.py index ba7b5996a4e..c67490dbea5 100644 --- a/tests/common/params_test.py +++ b/tests/common/params_test.py @@ -65,7 +65,10 @@ def test_overrides(self, input_type): def test_unflatten(self): 
flattened = {"a.b.c": 1, "a.b.d": 0, "a.e.f.g.h": 2, "b": 3} unflattened = unflatten(flattened) - assert unflattened == {"a": {"b": {"c": 1, "d": 0}, "e": {"f": {"g": {"h": 2}}}}, "b": 3} + assert unflattened == { + "a": {"b": {"c": 1, "d": 0}, "e": {"f": {"g": {"h": 2}}}}, + "b": 3, + } # should do nothing to a non-flat dictionary assert unflatten(unflattened) == unflattened diff --git a/tests/common/util_test.py b/tests/common/util_test.py index 60ff5551690..196b61a23cf 100644 --- a/tests/common/util_test.py +++ b/tests/common/util_test.py @@ -38,8 +38,20 @@ def test_lazy_groups_of(self): def test_pad_sequence_to_length(self): assert util.pad_sequence_to_length([1, 2, 3], 5) == [1, 2, 3, 0, 0] - assert util.pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 2) == [1, 2, 3, 2, 2] - assert util.pad_sequence_to_length([1, 2, 3], 5, padding_on_right=False) == [0, 0, 1, 2, 3] + assert util.pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 2) == [ + 1, + 2, + 3, + 2, + 2, + ] + assert util.pad_sequence_to_length([1, 2, 3], 5, padding_on_right=False) == [ + 0, + 0, + 1, + 2, + 3, + ] def test_namespace_match(self): assert util.namespace_match("*tags", "tags") diff --git a/tests/data/data_loaders/multiprocess_data_loader_test.py b/tests/data/data_loaders/multiprocess_data_loader_test.py index e0197edee71..32fe62bae82 100644 --- a/tests/data/data_loaders/multiprocess_data_loader_test.py +++ b/tests/data/data_loaders/multiprocess_data_loader_test.py @@ -29,7 +29,9 @@ class MockDatasetReader(DatasetReader): def __init__(self, model: str = "epwalsh/bert-xsmall-dummy", **kwargs) -> None: super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + manual_distributed_sharding=True, + manual_multiprocess_sharding=True, + **kwargs, ) self.tokenizer = PretrainedTransformerTokenizer(model) self.token_indexers = {"tokens": PretrainedTransformerIndexer(model)} @@ -104,7 +106,12 @@ def test_error_raised_when_text_fields_contain_token_indexers(max_instances_in_m [ dict(max_instances_in_memory=10, num_workers=2, batch_size=1), dict(num_workers=2, batch_size=1), - dict(max_instances_in_memory=10, num_workers=2, start_method="spawn", batch_size=1), + dict( + max_instances_in_memory=10, + num_workers=2, + start_method="spawn", + batch_size=1, + ), dict(num_workers=2, start_method="spawn", batch_size=1), dict(max_instances_in_memory=10, num_workers=0, batch_size=1), dict(num_workers=0, batch_size=1), diff --git a/tests/data/data_loaders/multitask_data_loader_test.py b/tests/data/data_loaders/multitask_data_loader_test.py index 35b28dfb721..0b69077b77c 100644 --- a/tests/data/data_loaders/multitask_data_loader_test.py +++ b/tests/data/data_loaders/multitask_data_loader_test.py @@ -9,7 +9,10 @@ from allennlp.data.dataset_readers import MultiTaskDatasetReader from allennlp.data.data_loaders.multitask_data_loader import MultiTaskDataLoader from allennlp.data.data_loaders.multitask_scheduler import RoundRobinScheduler -from allennlp.data.data_loaders.multitask_epoch_sampler import UniformSampler, WeightedSampler +from allennlp.data.data_loaders.multitask_epoch_sampler import ( + UniformSampler, + WeightedSampler, +) class FakeDatasetReaderA(DatasetReader): diff --git a/tests/data/dataloader_test.py b/tests/data/dataloader_test.py new file mode 100644 index 00000000000..cb422f61945 --- /dev/null +++ b/tests/data/dataloader_test.py @@ -0,0 +1,46 @@ +from typing import Iterable + +import pytest + +from allennlp.data.fields import LabelField +from 
allennlp.data.instance import Instance +from allennlp.data.dataloader import PyTorchDataLoader +from allennlp.data.dataset_readers.dataset_reader import DatasetReader + + +@pytest.mark.parametrize("lazy", (True, False)) +def test_loader_uses_all_instances_when_batches_per_epochs_set(lazy): + NUM_INSTANCES = 20 + BATCH_SIZE = 2 + BATCHES_PER_EPOCH = 3 + EPOCHS = 4 + + class FakeDatasetReader(DatasetReader): + def _read(self, filename: str) -> Iterable[Instance]: + for i in range(NUM_INSTANCES): + yield Instance({"index": LabelField(i, skip_indexing=True)}) + + reader = FakeDatasetReader(lazy=lazy) + dataset = reader.read("blah") + + loader = PyTorchDataLoader(dataset, batch_size=BATCH_SIZE, batches_per_epoch=BATCHES_PER_EPOCH) + epoch_batches = [] + for epoch in range(EPOCHS): + batches = [] + for batch in loader: + instances = [] + for index in batch["index"]: + instances.append(index) + batches.append(instances) + epoch_batches.append(batches) + + assert epoch_batches == [ + # Epoch 0. + [[0, 1], [2, 3], [4, 5]], + # Epoch 1. + [[6, 7], [8, 9], [10, 11]], + # Epoch 2. + [[12, 13], [14, 15], [16, 17]], + # Epoch 3. + [[18, 19], [0, 1], [2, 3]], + ] diff --git a/tests/data/dataset_readers/babi_reader_test.py b/tests/data/dataset_readers/babi_reader_test.py index 687f548ed10..3fcf244e652 100644 --- a/tests/data/dataset_readers/babi_reader_test.py +++ b/tests/data/dataset_readers/babi_reader_test.py @@ -1,15 +1,18 @@ import pytest from allennlp.common import Params +from allennlp.common.util import ensure_list from allennlp.data.dataset_readers import BabiReader from allennlp.common.testing import AllenNlpTestCase class TestBAbIReader: - @pytest.mark.parametrize("keep_sentences", [False, True]) - def test_read_from_file(self, keep_sentences): - reader = BabiReader(keep_sentences=keep_sentences) - instances = list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "babi.txt")) + @pytest.mark.parametrize( + "keep_sentences, lazy", [(False, False), (False, True), (True, False), (True, True)] + ) + def test_read_from_file(self, keep_sentences, lazy): + reader = BabiReader(keep_sentences=keep_sentences, lazy=lazy) + instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "babi.txt")) assert len(instances) == 8 if keep_sentences: diff --git a/tests/data/dataset_readers/dataset_reader_test.py b/tests/data/dataset_readers/dataset_reader_test.py index aedebefded3..22eab7abaeb 100644 --- a/tests/data/dataset_readers/dataset_reader_test.py +++ b/tests/data/dataset_readers/dataset_reader_test.py @@ -1,85 +1,287 @@ -from itertools import islice -from typing import Optional, List, Set +from collections import deque +import os +import shutil +from typing import Optional, NamedTuple, List +from filelock import FileLock import pytest import torch.distributed as dist +from allennlp.common.testing import AllenNlpTestCase from allennlp.common import util as common_util +from allennlp.common.checks import ConfigurationError from allennlp.data import Instance +from allennlp.data.dataloader import PyTorchDataLoader from allennlp.data.dataset_readers import ( + dataset_reader, DatasetReader, - WorkerInfo, + TextClassificationJsonReader, ) +from allennlp.data.dataset_readers.dataset_reader import AllennlpLazyDataset from allennlp.data.fields import LabelField -TOTAL_INSTANCES = 100 +def mock_collate_fn(item): + return item[0] -class MockDatasetReader(DatasetReader): - def _read(self, file_path): - for i in range(TOTAL_INSTANCES): - yield self.text_to_instance(i) +class 
TestDatasetReader(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + self.cache_directory = str(AllenNlpTestCase.FIXTURES_ROOT / "data_cache" / "with_prefix") - def text_to_instance(self, index: int): # type: ignore - return Instance({"index": LabelField(index, skip_indexing=True)}) + def teardown_method(self): + super().teardown_method() + if os.path.exists(self.cache_directory): + shutil.rmtree(self.cache_directory) + def test_lazy_dataset_can_be_iterated_through_multiple_times(self): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(lazy=True) + instances = reader.read(data_file) + assert isinstance(instances, AllennlpLazyDataset) -class MockMmpsDatasetReader(DatasetReader): - """ - Implements manual multi-process sharding (MMPS). - """ + first_pass_instances = list(instances) + assert len(first_pass_instances) > 2 + second_pass_instances = list(instances) + assert first_pass_instances == second_pass_instances - def __init__(self, **kwargs) -> None: - super().__init__(manual_multiprocess_sharding=True, **kwargs) + def test_read_only_creates_cache_file_once(self): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(cache_directory=self.cache_directory) + cache_file = reader._get_cache_location_for_file_path(str(data_file)) - def _read(self, file_path): - start_index = 0 - step_size = 1 - worker_info = self.get_worker_info() - if worker_info is not None: - start_index += worker_info.id - step_size *= worker_info.num_workers - for i in islice(range(TOTAL_INSTANCES), start_index, None, step_size): - yield self.text_to_instance(i) + # The first read will create the cache. + reader.read(data_file) + assert os.path.exists(cache_file) + with open(cache_file, "r") as in_file: + cache_contents = in_file.read() + # The second and all subsequent reads should _use_ the cache, not modify it. I looked + # into checking file modification times, but this test will probably be faster than the + # granularity of `os.path.getmtime()` (which only returns values in seconds). + reader.read(data_file) + reader.read(data_file) + reader.read(data_file) + reader.read(data_file) + with open(cache_file, "r") as in_file: + final_cache_contents = in_file.read() + assert cache_contents == final_cache_contents - def text_to_instance(self, index: int): # type: ignore - return Instance({"index": LabelField(index, skip_indexing=True)}) + @pytest.mark.parametrize("lazy", (True, False)) + def test_caching_works_with_lazy_reading(self, caplog, lazy: bool): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + snli_copy_file = str(data_file) + ".copy" + shutil.copyfile(data_file, snli_copy_file) + reader = TextClassificationJsonReader(lazy=lazy, cache_directory=self.cache_directory) + cache_file = reader._get_cache_location_for_file_path(snli_copy_file) + # The call to read() will give us an _iterator_. We'll iterate over it multiple times, + # and the caching behavior should change as we go. + assert not os.path.exists(cache_file) + instances = reader.read(snli_copy_file) -class MockMdsDatasetReader(DatasetReader): - """ - Implements manual distributed sharding (MDS). 
- """ + # The first iteration will create the cache + first_pass_instances = [] + for instance in instances: + first_pass_instances.append(instance) + assert "Caching instances to temp file" in " ".join([rec.message for rec in caplog.records]) + assert os.path.exists(cache_file) - def __init__(self, **kwargs) -> None: - super().__init__(manual_distributed_sharding=True, **kwargs) + # Now we _remove_ the data file, to be sure we're reading from the cache. + os.remove(snli_copy_file) + caplog.clear() + instances = reader.read(snli_copy_file) + second_pass_instances = [] + for instance in instances: + second_pass_instances.append(instance) + assert "Reading instances from cache" in " ".join([rec.message for rec in caplog.records]) - def _read(self, file_path): - start_index = 0 - step_size = 1 - if common_util.is_distributed(): - start_index += dist.get_rank() - step_size *= dist.get_world_size() - for i in islice(range(TOTAL_INSTANCES), start_index, None, step_size): - yield self.text_to_instance(i) + # We should get the same instances both times. + assert len(first_pass_instances) == len(second_pass_instances) + for instance, cached_instance in zip(first_pass_instances, second_pass_instances): + assert instance.fields == cached_instance.fields - def text_to_instance(self, index: int): # type: ignore - return Instance({"index": LabelField(index, skip_indexing=True)}) + # And just to be super paranoid, in case the second pass somehow bypassed the cache + # because of a bug that's hard to detect, we'll read the + # instances from the cache with a non-lazy iterator and make sure they're the same. + reader = TextClassificationJsonReader(lazy=False, cache_directory=self.cache_directory) + cached_instances = reader.read(snli_copy_file) + assert len(first_pass_instances) == len(cached_instances) + for instance, cached_instance in zip(first_pass_instances, cached_instances): + assert instance.fields == cached_instance.fields + + @pytest.mark.parametrize("lazy", (True, False)) + def test_caching_skipped_when_lock_not_acquired(self, caplog, lazy: bool): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(lazy=lazy, cache_directory=self.cache_directory) + reader.CACHE_FILE_LOCK_TIMEOUT = 1 + cache_file = reader._get_cache_location_for_file_path(str(data_file)) + + with FileLock(cache_file + ".lock"): + # Right now we hold the lock on the cache, so the reader shouldn't + # be able to write to it. It will wait for 1 second (because that's what + # we set the timeout to be), and then just read the instances as normal. + caplog.clear() + instances = list(reader.read(data_file)) + assert "Failed to acquire lock" in caplog.text + assert instances + + # We didn't write to the cache because we couldn't acquire the file lock. + assert not os.path.exists(cache_file) + + # Now we'll write to the cache and then try the same thing again, this + # time making sure that we can still successfully read without the cache + # when the lock can't be acquired. + deque(reader.read(data_file), maxlen=1) + assert os.path.exists(cache_file) + + with FileLock(cache_file + ".lock"): + # Right now we hold the lock on the cache, so the reader shouldn't + # be able to write to it. It will wait for 1 second (because that's what + # we set the timeout to be), and then just read the instances as normal. 
+ caplog.clear() + instances = list(reader.read(data_file)) + assert "Failed to acquire lock" in caplog.text + assert instances + + @pytest.mark.parametrize("lazy", (True, False)) + def test_caching_skipped_with_distributed_training(self, caplog, monkeypatch, lazy): + monkeypatch.setattr(common_util, "is_distributed", lambda: True) + monkeypatch.setattr(dist, "get_rank", lambda: 0) + monkeypatch.setattr(dist, "get_world_size", lambda: 1) + + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(lazy=lazy, cache_directory=self.cache_directory) + cache_file = reader._get_cache_location_for_file_path(str(data_file)) + + deque(reader.read(data_file), maxlen=1) + assert not os.path.exists(cache_file) + assert "Can't cache data instances when there are multiple processes" in caplog.text + + def test_caching_with_lazy_reader_in_multi_process_loader(self): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(lazy=True, cache_directory=self.cache_directory) + deque( + PyTorchDataLoader(reader.read(data_file), collate_fn=mock_collate_fn, num_workers=2), + maxlen=0, + ) + + # We shouldn't write to the cache when the data is being loaded from multiple + # processes. + cache_file = reader._get_cache_location_for_file_path(str(data_file)) + assert not os.path.exists(cache_file) + + # But try again from the main process and we should see the cache file. + instances = list(reader.read(data_file)) + assert instances + assert os.path.exists(cache_file) + + # Reading again from a multi-process loader should read from the cache. + new_instances = list( + PyTorchDataLoader(reader.read(data_file), collate_fn=mock_collate_fn, num_workers=2) + ) + assert len(instances) == len(new_instances) + + @pytest.mark.parametrize("lazy", (True, False)) + def test_max_instances(self, lazy): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(max_instances=2, lazy=lazy) + instances = reader.read(data_file) + instance_count = sum(1 for _ in instances) + assert instance_count == 2 + + @pytest.mark.parametrize("num_workers", (0, 1, 2)) + def test_max_instances_with_multi_process_loader(self, num_workers): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + reader = TextClassificationJsonReader(max_instances=2, lazy=True) + instances = list( + PyTorchDataLoader( + reader.read(data_file), collate_fn=mock_collate_fn, num_workers=num_workers + ) + ) + assert len(instances) == 2 + + @pytest.mark.parametrize("lazy", (True, False)) + def test_cached_max_instances(self, lazy): + data_file = ( + AllenNlpTestCase.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus.jsonl" + ) + + # If we try reading with max instances, it shouldn't write to the cache. + reader = TextClassificationJsonReader( + cache_directory=self.cache_directory, lazy=lazy, max_instances=2 + ) + instances = list(reader.read(data_file)) + assert len(instances) == 2 + cache_file = reader._get_cache_location_for_file_path(str(data_file)) + assert not os.path.exists(cache_file) -class MockMmpdsDatasetReader(DatasetReader): - """ - Implements manual multi-process and distributed sharding (MMPDS). 
- """ + # Now reading again with no max_instances specified should create the cache. + reader = TextClassificationJsonReader(cache_directory=self.cache_directory, lazy=lazy) + instances = list(reader.read(data_file)) + assert len(instances) > 2 + assert os.path.exists(cache_file) - def __init__(self, **kwargs) -> None: - super().__init__( - manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + # The second read should only return two instances, even though it's from the cache. + reader = TextClassificationJsonReader( + cache_directory=self.cache_directory, max_instances=2, lazy=lazy ) + instances = list(reader.read(data_file)) + assert len(instances) == 2 + + +class MockWorkerInfo(NamedTuple): + id: int + num_workers: int + +class MockDatasetReader(DatasetReader): def _read(self, file_path): - for i in self.shard_iterable(range(TOTAL_INSTANCES)): + for i in range(10): yield self.text_to_instance(i) def text_to_instance(self, index: int): # type: ignore @@ -87,94 +289,94 @@ def text_to_instance(self, index: int): # type: ignore @pytest.mark.parametrize( - "world_size, num_workers, max_instances", + "node_rank, world_size, worker_id, num_workers, max_instances, expected_result", [ - (4, 2, None), - (4, 2, 67), - (4, None, None), - (4, None, None), - (None, 2, None), - (None, 2, 67), - (None, None, None), - (None, None, 67), + (None, None, None, None, None, list(range(10))), + (None, None, None, None, 5, list(range(5))), + (None, None, None, None, 12, list(range(10))), + (None, None, 0, 1, None, list(range(10))), + (None, None, 0, 2, None, [0, 2, 4, 6, 8]), + (None, None, 1, 2, None, [1, 3, 5, 7, 9]), + (None, None, 0, 2, 5, [0, 2, 4]), + (None, None, 1, 2, 5, [1, 3]), + (0, 1, None, None, None, list(range(10))), + (0, 2, None, None, None, [0, 2, 4, 6, 8]), + (1, 2, None, None, None, [1, 3, 5, 7, 9]), + (0, 2, None, None, 5, [0, 2, 4]), + (1, 2, None, None, 5, [1, 3]), + (0, 2, 0, 2, None, [0, 4, 8]), + (0, 2, 1, 2, None, [1, 5, 9]), + (1, 2, 0, 2, None, [2, 6]), + (1, 2, 1, 2, None, [3, 7]), + (0, 2, 0, 2, 5, [0, 4]), ], ) -@pytest.mark.parametrize( - "reader_class", - [MockDatasetReader, MockMmpsDatasetReader, MockMdsDatasetReader, MockMmpdsDatasetReader], -) def test_instance_slicing( monkeypatch, - reader_class, + node_rank: Optional[int], world_size: Optional[int], + worker_id: Optional[int], num_workers: Optional[int], max_instances: Optional[int], + expected_result: List[int], ): - """ - Ensure that the intances read by each worker are always unique and the total - adds up to `max_instances`. 
- """ - results: List[Set[int]] = [] - - minimum_expected_result_size = max_instances or TOTAL_INSTANCES - maximum_expected_result_size = max_instances or TOTAL_INSTANCES - - if world_size is not None and num_workers is not None: - minimum_expected_result_size //= world_size - minimum_expected_result_size //= num_workers - maximum_expected_result_size = minimum_expected_result_size + 1 - for global_rank in range(world_size): - monkeypatch.setattr(common_util, "is_distributed", lambda: True) - monkeypatch.setattr(dist, "get_rank", lambda: global_rank) - monkeypatch.setattr(dist, "get_world_size", lambda: world_size) - for worker_id in range(num_workers): - reader = reader_class(max_instances=max_instances) - reader._set_worker_info(WorkerInfo(num_workers, worker_id)) - result = set( - x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore - ) - results.append(result) - elif world_size is not None: - minimum_expected_result_size //= world_size - maximum_expected_result_size = minimum_expected_result_size + 1 - for global_rank in range(world_size): - monkeypatch.setattr(common_util, "is_distributed", lambda: True) - monkeypatch.setattr(dist, "get_rank", lambda: global_rank) - monkeypatch.setattr(dist, "get_world_size", lambda: world_size) - reader = reader_class(max_instances=max_instances) - result = set( - x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore - ) - results.append(result) - elif num_workers is not None: - minimum_expected_result_size //= num_workers - maximum_expected_result_size = minimum_expected_result_size + 1 - for worker_id in range(num_workers): - reader = reader_class(max_instances=max_instances) - reader._set_worker_info(WorkerInfo(num_workers, worker_id)) - result = set( - x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore - ) - results.append(result) - else: - reader = reader_class(max_instances=max_instances) - result = set( - x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore + if node_rank is not None and world_size is not None: + monkeypatch.setattr(common_util, "is_distributed", lambda: True) + monkeypatch.setattr(dist, "get_rank", lambda: node_rank) + monkeypatch.setattr(dist, "get_world_size", lambda: world_size) + + if worker_id is not None and num_workers is not None: + monkeypatch.setattr( + dataset_reader, "get_worker_info", lambda: MockWorkerInfo(worker_id, num_workers) ) - results.append(result) - - # We need to check that all of the result sets are mutually exclusive and that they're - # union has size `max_instances`. - # Checking that they're mutually exclusive is equivalent to checking that the sum - # of the size of each set is equal to the size of the union. - - union: Set[int] = set() - total: int = 0 - for result in results: - union |= result - total += len(result) - # Also make sure the size of the set is within the expected bounds. 
-        assert minimum_expected_result_size <= len(result)
-        assert len(result) <= maximum_expected_result_size
-
-    assert len(union) == total == (max_instances or TOTAL_INSTANCES)
+
+    reader = MockDatasetReader(max_instances=max_instances)
+    result = list((x["index"].label for x in reader.read("the-path-doesnt-matter")))  # type: ignore
+
+    assert result == expected_result
+
+
+class BadLazyReader(DatasetReader):
+    def _read(self, file_path):
+        return [self.text_to_instance(i) for i in range(10)]
+
+    def text_to_instance(self, index: int):  # type: ignore
+        return Instance({"index": LabelField(index, skip_indexing=True)})
+
+
+def test_config_error_when_lazy_reader_returns_list():
+    reader = BadLazyReader(lazy=True)
+    with pytest.raises(ConfigurationError, match="must return a generator"):
+        deque(reader.read("path"), maxlen=0)
+
+
+class BadReaderReadsNothing(DatasetReader):
+    def _read(self, file_path):
+        return []
+
+    def text_to_instance(self, index: int):  # type: ignore
+        return Instance({"index": LabelField(index, skip_indexing=True)})
+
+
+def test_config_error_when_reader_returns_no_instances():
+    reader = BadReaderReadsNothing()
+    with pytest.raises(ConfigurationError, match="No instances were read"):
+        deque(reader.read("path"), maxlen=0)
+
+
+class BadReaderForgetsToSetLazy(DatasetReader):
+    def __init__(self):
+        pass
+
+    def _read(self, file_path):
+        for i in range(10):
+            yield self.text_to_instance(i)
+
+    def text_to_instance(self, index: int):  # type: ignore
+        return Instance({"index": LabelField(index, skip_indexing=True)})
+
+
+def test_warning_when_reader_has_no_lazy_set():
+    with pytest.warns(UserWarning, match="DatasetReader.lazy is not set"):
+        reader = BadReaderForgetsToSetLazy()
+        reader.read("path")
diff --git a/tests/data/dataset_readers/dataset_utils/span_utils_test.py b/tests/data/dataset_readers/dataset_utils/span_utils_test.py
index a4bf767a07e..e8714c71d17 100644
--- a/tests/data/dataset_readers/dataset_utils/span_utils_test.py
+++ b/tests/data/dataset_readers/dataset_utils/span_utils_test.py
@@ -3,7 +3,8 @@
 from allennlp.common.testing import AllenNlpTestCase
 from allennlp.data.dataset_readers.dataset_utils import span_utils
-from allennlp.data.tokenizers import Token, SpacyTokenizer
+from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
+from allennlp.data.tokenizers.token import Token


 class SpanUtilsTest(AllenNlpTestCase):
diff --git a/tests/data/dataset_readers/interleaving_dataset_reader_test.py b/tests/data/dataset_readers/interleaving_dataset_reader_test.py
index 5e32138eae0..cdd7de2a3be 100644
--- a/tests/data/dataset_readers/interleaving_dataset_reader_test.py
+++ b/tests/data/dataset_readers/interleaving_dataset_reader_test.py
@@ -32,11 +32,11 @@ def test_round_robin(self):
         reader = InterleavingDatasetReader(readers)
         data_dir = self.FIXTURES_ROOT / "data"
-        file_path = {
-            "a": data_dir / "babi.txt",
-            "b": data_dir / "conll2003.txt",
-            "c": data_dir / "conll2003.txt",
-        }
+        file_path = f"""{{
+            "a": "{data_dir / 'babi.txt'}",
+            "b": "{data_dir / 'conll2003.txt'}",
+            "c": "{data_dir / 'conll2003.txt'}"
+        }}"""
         instances = list(reader.read(file_path))
         first_three_keys = {instance.fields["dataset"].metadata for instance in instances[:3]}
diff --git a/tests/data/dataset_readers/lazy_dataset_reader_test.py b/tests/data/dataset_readers/lazy_dataset_reader_test.py
new file mode 100644
index 00000000000..55ded98d6cf
--- /dev/null
+++ b/tests/data/dataset_readers/lazy_dataset_reader_test.py
@@ -0,0 +1,62 @@
+from typing import Iterable, List
+
+from 
allennlp.data.fields import TextField +from allennlp.data.instance import Instance +from allennlp.data.dataset_readers import DatasetReader +from allennlp.data.token_indexers import SingleIdTokenIndexer +from allennlp.data.tokenizers import Token +from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.util import ensure_list + + +class LazyDatasetReader(DatasetReader): + def __init__(self, instances: List[Instance], lazy: bool) -> None: + super().__init__() + self.lazy = lazy + self._instances = instances + self.num_reads = 0 + + def _read(self, _: str) -> Iterable[Instance]: + self.num_reads += 1 + return (instance for instance in self._instances) + + +class TestLazyDatasetReader(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + token_indexer = {"tokens": SingleIdTokenIndexer()} + + field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]], token_indexer) + field2 = TextField( + [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]], token_indexer + ) + field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]], token_indexer) + field4 = TextField([Token(t) for t in ["this", "is", "short"]], token_indexer) + self.instances = [ + Instance({"text1": field1, "text2": field2}), + Instance({"text1": field3, "text2": field4}), + ] + + def test_lazy(self): + reader = LazyDatasetReader(self.instances, lazy=True) + assert reader.num_reads == 0 + + instances = reader.read("path/to/file") + + for _ in range(10): + _instances = (i for i in instances) + assert ensure_list(_instances) == self.instances + + assert reader.num_reads == 10 + + def test_non_lazy(self): + reader = LazyDatasetReader(self.instances, lazy=False) + assert reader.num_reads == 0 + + instances = reader.read("path/to/file") + + for _ in range(10): + _instances = (i for i in instances) + assert ensure_list(_instances) == self.instances + + assert reader.num_reads == 1 diff --git a/tests/data/dataset_readers/sequence_tagging_test.py b/tests/data/dataset_readers/sequence_tagging_test.py index 1da3fca977b..23ce6234456 100644 --- a/tests/data/dataset_readers/sequence_tagging_test.py +++ b/tests/data/dataset_readers/sequence_tagging_test.py @@ -1,13 +1,16 @@ +import pytest + from allennlp.data.dataset_readers import SequenceTaggingDatasetReader +from allennlp.common.util import ensure_list from allennlp.common.testing import AllenNlpTestCase class TestSequenceTaggingDatasetReader: - def test_default_format(self): - reader = SequenceTaggingDatasetReader() - instances = list( - reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv") - ) + @pytest.mark.parametrize("lazy", (True, False)) + def test_default_format(self, lazy): + reader = SequenceTaggingDatasetReader(lazy=lazy) + instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv") + instances = ensure_list(instances) assert len(instances) == 4 fields = instances[0].fields @@ -25,7 +28,8 @@ def test_default_format(self): def test_brown_corpus_format(self): reader = SequenceTaggingDatasetReader(word_tag_delimiter="/") - instances = list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt")) + instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt") + instances = ensure_list(instances) assert len(instances) == 4 fields = instances[0].fields diff --git a/tests/data/dataset_readers/sharded_dataset_reader_test.py b/tests/data/dataset_readers/sharded_dataset_reader_test.py index d1fa329ec28..80bff533c8b 
100644 --- a/tests/data/dataset_readers/sharded_dataset_reader_test.py +++ b/tests/data/dataset_readers/sharded_dataset_reader_test.py @@ -1,13 +1,16 @@ +from collections import Counter import glob import os import tarfile -from collections import Counter from typing import Tuple +import pytest + from allennlp.common.testing import AllenNlpTestCase from allennlp.data.dataset_readers import ( SequenceTaggingDatasetReader, ShardedDatasetReader, + DatasetReader, ) from allennlp.data.instance import Instance @@ -22,12 +25,27 @@ def fingerprint(instance: Instance) -> Tuple[str, ...]: return text_tuple + labels_tuple +def test_exception_raised_when_base_reader_implements_sharding(): + class ManuallyShardedBaseReader(DatasetReader): + def __init__(self, **kwargs): + super().__init__(manual_distributed_sharding=True, **kwargs) + + def _read(self, file_path: str): + pass + + def text_to_instance(self, text: str): # type: ignore + pass + + with pytest.raises(ValueError, match="should not implement manual distributed sharding"): + ShardedDatasetReader(ManuallyShardedBaseReader()) + + class TestShardedDatasetReader(AllenNlpTestCase): def setup_method(self) -> None: super().setup_method() # use SequenceTaggingDatasetReader as the base reader - self.base_reader = SequenceTaggingDatasetReader() + self.base_reader = SequenceTaggingDatasetReader(lazy=True) base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv" # Make 100 copies of the data diff --git a/tests/data/dataset_readers/text_classification_json_test.py b/tests/data/dataset_readers/text_classification_json_test.py index 88d72dc0b4b..4baf5f7c30b 100644 --- a/tests/data/dataset_readers/text_classification_json_test.py +++ b/tests/data/dataset_readers/text_classification_json_test.py @@ -2,21 +2,24 @@ from typing import List from allennlp.data.dataset_readers import TextClassificationJsonReader +from allennlp.common.util import ensure_list from allennlp.common.testing import AllenNlpTestCase from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter from allennlp.common.util import get_spacy_model class TestTextClassificationJsonReader: - def test_set_skip_indexing_true(self): - reader = TextClassificationJsonReader(skip_label_indexing=True) + @pytest.mark.parametrize("lazy", (True, False)) + def test_set_skip_indexing_true(self, lazy): + reader = TextClassificationJsonReader(lazy=lazy, skip_label_indexing=True) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "integer_labels.jsonl" ) - instances = list(reader.read(ag_path)) + instances = reader.read(ag_path) + instances = ensure_list(instances) instance1 = {"tokens": ["This", "text", "has", "label", "0"], "label": 0} instance2 = {"tokens": ["This", "text", "has", "label", "1"], "label": 1} @@ -36,18 +39,20 @@ def test_set_skip_indexing_true(self): / "text_classification_json" / "imdb_corpus.jsonl" ) - list(reader.read(ag_path)) + ensure_list(reader.read(ag_path)) assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True." 
- def test_read_from_file_ag_news_corpus(self): - reader = TextClassificationJsonReader() + @pytest.mark.parametrize("lazy", (True, False)) + def test_read_from_file_ag_news_corpus(self, lazy): + reader = TextClassificationJsonReader(lazy=lazy) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "ag_news_corpus.jsonl" ) - instances = list(reader.read(ag_path)) + instances = reader.read(ag_path) + instances = ensure_list(instances) instance1 = { "tokens": [ @@ -176,15 +181,17 @@ def test_read_from_file_ag_news_corpus(self): assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"] assert fields["label"].label == instance3["label"] - def test_read_from_file_ag_news_corpus_and_truncates_properly(self): - reader = TextClassificationJsonReader(max_sequence_length=5) + @pytest.mark.parametrize("lazy", (True, False)) + def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy): + reader = TextClassificationJsonReader(lazy=lazy, max_sequence_length=5) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "ag_news_corpus.jsonl" ) - instances = list(reader.read(ag_path)) + instances = reader.read(ag_path) + instances = ensure_list(instances) instance1 = {"tokens": ["Memphis", "Rout", "Still", "Stings", "for"], "label": "2"} instance2 = {"tokens": ["AP", "-", "Eli", "Manning", "has"], "label": "2"} @@ -202,11 +209,12 @@ def test_read_from_file_ag_news_corpus_and_truncates_properly(self): assert fields["label"].label == instance3["label"] @pytest.mark.parametrize("max_sequence_length", (None, 5)) + @pytest.mark.parametrize("lazy", (True, False)) def test_read_from_file_ag_news_corpus_and_segments_sentences_properly( - self, max_sequence_length + self, lazy, max_sequence_length ): reader = TextClassificationJsonReader( - segment_sentences=True, max_sequence_length=max_sequence_length + lazy=lazy, segment_sentences=True, max_sequence_length=max_sequence_length ) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT @@ -214,7 +222,8 @@ def test_read_from_file_ag_news_corpus_and_segments_sentences_properly( / "text_classification_json" / "ag_news_corpus.jsonl" ) - instances = list(reader.read(ag_path)) + instances = reader.read(ag_path) + instances = ensure_list(instances) splitter = SpacySentenceSplitter() spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False, False) diff --git a/tests/data/fields/array_field_test.py b/tests/data/fields/array_field_test.py new file mode 100644 index 00000000000..fbb0eb7da84 --- /dev/null +++ b/tests/data/fields/array_field_test.py @@ -0,0 +1,115 @@ +import numpy +import torch + +from allennlp.common.testing.test_case import AllenNlpTestCase +from allennlp.data.fields import ArrayField, ListField + + +class TestArrayField(AllenNlpTestCase): + def test_get_padding_lengths_correctly_returns_ordered_shape(self): + shape = [3, 4, 5, 6] + array = numpy.zeros(shape) + array_field = ArrayField(array) + lengths = array_field.get_padding_lengths() + for i in range(len(lengths)): + assert lengths["dimension_{}".format(i)] == shape[i] + + def test_as_tensor_handles_larger_padding_dimensions(self): + shape = [3, 4] + array = numpy.ones(shape) + array_field = ArrayField(array) + + padded_tensor = ( + array_field.as_tensor({"dimension_0": 5, "dimension_1": 6}).detach().cpu().numpy() + ) + numpy.testing.assert_array_equal(padded_tensor[:3, :4], array) + numpy.testing.assert_array_equal(padded_tensor[3:, 4:], 0.0) + + def test_padding_handles_list_fields(self): + array1 = ArrayField(numpy.ones([2, 
3])) + array2 = ArrayField(numpy.ones([1, 5])) + empty_array = array1.empty_field() + list_field = ListField([array1, array2, empty_array]) + + returned_tensor = ( + list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy() + ) + correct_tensor = numpy.array( + [ + [[1.0, 1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0, 0.0]], + [[1.0, 1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0]], + ] + ) + numpy.testing.assert_array_equal(returned_tensor, correct_tensor) + + def test_padding_handles_list_fields_with_padding_values(self): + array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1) + array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1) + empty_array = array1.empty_field() + list_field = ListField([array1, array2, empty_array]) + + returned_tensor = ( + list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy() + ) + correct_tensor = numpy.array( + [ + [[1.0, 1.0, 1.0, -1.0, -1.0], [1.0, 1.0, 1.0, -1.0, -1.0]], + [[1.0, 1.0, 1.0, 1.0, 1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]], + [[-1.0, -1.0, -1.0, -1.0, -1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]], + ] + ) + numpy.testing.assert_array_equal(returned_tensor, correct_tensor) + + def test_printing_doesnt_crash(self): + array = ArrayField(numpy.ones([2, 3]), padding_value=-1) + print(array) + + def test_as_tensor_works_with_scalar(self): + array = ArrayField(numpy.asarray(42)) + returned_tensor = array.as_tensor(array.get_padding_lengths()) + current_tensor = numpy.asarray(42) + numpy.testing.assert_array_equal(returned_tensor, current_tensor) + + def test_as_tensor_with_scalar_keeps_dtype(self): + array = ArrayField(numpy.asarray(42, dtype=numpy.float32)) + returned_tensor = array.as_tensor(array.get_padding_lengths()) + assert returned_tensor.dtype == torch.float32 + + def test_alternative_dtypes(self): + shape = [3, 4, 5, 6] + array = numpy.zeros(shape) + + # Setting dtype to numpy.int64 should produce a torch.LongTensor when field is converted to + # a tensor + array_field1 = ArrayField(array, dtype=numpy.int64) + returned_tensor1 = array_field1.as_tensor(array_field1.get_padding_lengths()) + assert returned_tensor1.dtype == torch.int64 + + # Setting dtype to numpy.uint8 should produce a torch.ByteTensor when field is converted to + # a tensor + array_field2 = ArrayField(array, dtype=numpy.uint8) + returned_tensor2 = array_field2.as_tensor(array_field2.get_padding_lengths()) + assert returned_tensor2.dtype == torch.uint8 + + # Padding should not affect dtype + padding_lengths = {"dimension_" + str(i): 10 for i, _ in enumerate(shape)} + padded_tensor = array_field2.as_tensor(padding_lengths) + assert padded_tensor.dtype == torch.uint8 + + # Empty fields should have the same dtype + empty_field = array_field2.empty_field() + assert empty_field.dtype == array_field2.dtype + + def test_len_works_with_scalar(self): + array = ArrayField(numpy.asarray(42)) + assert len(array) == 1 + + def test_eq(self): + array1 = ArrayField(numpy.asarray([1, 1, 1])) + array2 = ArrayField(numpy.asarray([[1, 1, 1], [1, 1, 1]])) + array3 = ArrayField(numpy.asarray([1, 1, 2])) + array4 = ArrayField(numpy.asarray([1, 1, 1])) + assert array1 != array2 + assert array1 != array3 + assert array1 == array4 diff --git a/tests/data/fields/list_field_test.py b/tests/data/fields/list_field_test.py index cdf2ad97d87..2356d9b3646 100644 --- a/tests/data/fields/list_field_test.py +++ b/tests/data/fields/list_field_test.py @@ -7,7 +7,8 @@ from allennlp.data import Token, Vocabulary, Instance 
from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer -from allennlp.data.data_loaders import SimpleDataLoader +from allennlp.data.dataloader import PyTorchDataLoader +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset from allennlp.data.tokenizers import SpacyTokenizer from allennlp.models import Model from allennlp.modules import Embedding @@ -296,10 +297,11 @@ def test_empty_list_can_be_tensorized(self): instance.as_tensor_dict() def test_batch_with_some_empty_lists_works(self): - instances = [self.empty_instance, self.non_empty_instance] + dataset = AllennlpDataset([self.empty_instance, self.non_empty_instance], self.vocab) + model = DummyModel(self.vocab) model.eval() - loader = SimpleDataLoader(instances, 2, vocab=self.vocab) + loader = PyTorchDataLoader(dataset, batch_size=2) batch = next(iter(loader)) model.forward(**batch) @@ -310,10 +312,11 @@ def test_batch_with_some_empty_lists_works(self): # makes a whole lot more sense to just have a minimally-sized tensor that # gets entirely masked and has no effect on the rest of the model. def test_batch_of_entirely_empty_lists_works(self): - instances = [self.empty_instance, self.empty_instance] + dataset = AllennlpDataset([self.empty_instance, self.empty_instance], self.vocab) + model = DummyModel(self.vocab) model.eval() - loader = SimpleDataLoader(instances, 2, vocab=self.vocab) + loader = PyTorchDataLoader(dataset, batch_size=2) batch = next(iter(loader)) model.forward(**batch) diff --git a/tests/data/samplers/bucket_batch_sampler_test.py b/tests/data/samplers/bucket_batch_sampler_test.py index 3a972facdc2..dc71aa2efaa 100644 --- a/tests/data/samplers/bucket_batch_sampler_test.py +++ b/tests/data/samplers/bucket_batch_sampler_test.py @@ -1,18 +1,21 @@ from allennlp.common import Params -from allennlp.data import Instance, Token, Batch +from allennlp.data import Instance, Token +from allennlp.data.batch import Batch from allennlp.data.fields import TextField from allennlp.data.samplers import BucketBatchSampler -from allennlp.data.data_loaders import MultiProcessDataLoader +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset +from allennlp.data.dataloader import PyTorchDataLoader from .sampler_test import SamplerTest class TestBucketSampler(SamplerTest): def test_create_batches_groups_correctly(self): - sampler = BucketBatchSampler(batch_size=2, padding_noise=0, sorting_keys=["text"]) + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"]) grouped_instances = [] - for indices in sampler.get_batch_indices(self.instances): + for indices in sampler: grouped_instances.append([self.instances[idx] for idx in indices]) expected_groups = [ [self.instances[4], self.instances[2]], @@ -25,7 +28,8 @@ def test_create_batches_groups_correctly(self): assert expected_groups == [] def test_guess_sorting_key_picks_the_longest_key(self): - sampler = BucketBatchSampler(batch_size=2, padding_noise=0) + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0) instances = [] short_tokens = [Token(t) for t in ["what", "is", "this", "?"]] long_tokens = [Token(t) for t in ["this", "is", "a", "not", "very", "long", "passage"]] @@ -58,12 +62,13 @@ def test_guess_sorting_key_picks_the_longest_key(self): assert 
sampler.sorting_keys == ["passage"] def test_from_params(self): + dataset = AllennlpDataset(self.instances, self.vocab) params = Params({}) sorting_keys = ["s1", "s2"] params["sorting_keys"] = sorting_keys params["batch_size"] = 32 - sampler = BucketBatchSampler.from_params(params=params) + sampler = BucketBatchSampler.from_params(params=params, data_source=dataset) assert sampler.sorting_keys == sorting_keys assert sampler.padding_noise == 0.1 @@ -78,33 +83,27 @@ def test_from_params(self): } ) - sampler = BucketBatchSampler.from_params(params=params) + sampler = BucketBatchSampler.from_params(params=params, data_source=dataset) assert sampler.sorting_keys == sorting_keys assert sampler.padding_noise == 0.5 assert sampler.batch_size == 100 assert sampler.drop_last def test_drop_last_works(self): + dataset = AllennlpDataset(self.instances, vocab=self.vocab) sampler = BucketBatchSampler( + dataset, batch_size=2, padding_noise=0, sorting_keys=["text"], drop_last=True, ) - # We use a custom collate_fn for testing, which doesn't actually create tensors, # just the allennlp Batches. - def collate_fn(x, **kwargs): - return Batch(x) - - data_loader = MultiProcessDataLoader( - self.get_mock_reader(), - "fake_path", - batch_sampler=sampler, + dataloader = PyTorchDataLoader( + dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) ) - data_loader.collate_fn = collate_fn - data_loader.index_with(self.vocab) - batches = [batch for batch in iter(data_loader)] + batches = [batch for batch in iter(dataloader)] stats = self.get_batches_stats(batches) # all batches have length batch_size @@ -114,21 +113,29 @@ def collate_fn(x, **kwargs): assert stats["total_instances"] == len(self.instances) - 1 def test_batch_count(self): - sampler = BucketBatchSampler(batch_size=2, padding_noise=0, sorting_keys=["text"]) - data_loader = MultiProcessDataLoader( - self.get_mock_reader(), "fake_path", batch_sampler=sampler + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"]) + # We use a custom collate_fn for testing, which doesn't actually create tensors, + # just the allennlp Batches. + dataloader = PyTorchDataLoader( + dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) ) - data_loader.index_with(self.vocab) - assert len(data_loader) == 3 + + assert len(dataloader) == 3 def test_batch_count_with_drop_last(self): + dataset = AllennlpDataset(self.instances, vocab=self.vocab) sampler = BucketBatchSampler( + dataset, batch_size=2, padding_noise=0, sorting_keys=["text"], drop_last=True, ) - data_loader = MultiProcessDataLoader( - self.get_mock_reader(), "fake_path", batch_sampler=sampler + # We use a custom collate_fn for testing, which doesn't actually create tensors, + # just the allennlp Batches. 
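+        # When a batch_sampler is used, the data loader's length is the number of
+        # batches produced by the sampler, not the number of instances.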
+ dataloader = PyTorchDataLoader( + dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) ) - assert len(data_loader) == 2 + + assert len(dataloader) == 2 diff --git a/tests/data/samplers/max_tokens_batch_sampler_test.py b/tests/data/samplers/max_tokens_batch_sampler_test.py index a3b7e094733..04e5c87ca6c 100644 --- a/tests/data/samplers/max_tokens_batch_sampler_test.py +++ b/tests/data/samplers/max_tokens_batch_sampler_test.py @@ -1,17 +1,23 @@ +from allennlp.common import Params from allennlp.data import Instance, Token +from allennlp.data.batch import Batch from allennlp.data.fields import TextField from allennlp.data.samplers import MaxTokensBatchSampler -from allennlp.data.data_loaders import MultiProcessDataLoader +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset +from allennlp.data.dataloader import PyTorchDataLoader from .sampler_test import SamplerTest class TestMaxTokensSampler(SamplerTest): def test_create_batches_groups_correctly(self): - sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0, sorting_keys=["text"]) + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = MaxTokensBatchSampler( + dataset, max_tokens=8, padding_noise=0, sorting_keys=["text"] + ) grouped_instances = [] - for indices in sampler.get_batch_indices(self.instances): + for indices in sampler: grouped_instances.append([self.instances[idx] for idx in indices]) expected_groups = [ [self.instances[4], self.instances[2]], @@ -24,7 +30,8 @@ def test_create_batches_groups_correctly(self): assert expected_groups == [] def test_guess_sorting_key_picks_the_longest_key(self): - sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0) + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = MaxTokensBatchSampler(dataset, max_tokens=8, padding_noise=0) instances = [] short_tokens = [Token(t) for t in ["what", "is", "this", "?"]] long_tokens = [Token(t) for t in ["this", "is", "a", "not", "very", "long", "passage"]] @@ -56,9 +63,35 @@ def test_guess_sorting_key_picks_the_longest_key(self): sampler._guess_sorting_keys(instances) assert sampler.sorting_keys == ["passage"] + def test_from_params(self): + dataset = AllennlpDataset(self.instances, self.vocab) + params = Params({}) + + sorting_keys = ["s1", "s2"] + params["sorting_keys"] = sorting_keys + params["max_tokens"] = 32 + sampler = MaxTokensBatchSampler.from_params(params=params, data_source=dataset) + + assert sampler.sorting_keys == sorting_keys + assert sampler.padding_noise == 0.1 + assert sampler.max_tokens == 32 + + params = Params({"sorting_keys": sorting_keys, "padding_noise": 0.5, "max_tokens": 100}) + + sampler = MaxTokensBatchSampler.from_params(params=params, data_source=dataset) + assert sampler.sorting_keys == sorting_keys + assert sampler.padding_noise == 0.5 + assert sampler.max_tokens == 100 + def test_batch_count(self): - sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0, sorting_keys=["text"]) - data_loader = MultiProcessDataLoader( - self.get_mock_reader(), "fake_path", batch_sampler=sampler + dataset = AllennlpDataset(self.instances, vocab=self.vocab) + sampler = MaxTokensBatchSampler( + dataset, max_tokens=8, padding_noise=0, sorting_keys=["text"] ) - assert len(data_loader) == 3 + # We use a custom collate_fn for testing, which doesn't actually create tensors, + # just the allennlp Batches. 
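+        # The sampler is configured exactly as in test_create_batches_groups_correctly
+        # above, so we expect the same three batches here.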
+ dataloader = PyTorchDataLoader( + dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) + ) + + assert len(dataloader) == 3 diff --git a/tests/data/samplers/sampler_test.py b/tests/data/samplers/sampler_test.py index 3be895f8657..e981d41ebec 100644 --- a/tests/data/samplers/sampler_test.py +++ b/tests/data/samplers/sampler_test.py @@ -1,7 +1,7 @@ from typing import List, Iterable, Dict, Union from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Vocabulary, Instance, Token, Batch, DatasetReader +from allennlp.data import Vocabulary, Instance, Token, Batch from allennlp.data.fields import TextField from allennlp.data.token_indexers import SingleIdTokenIndexer @@ -40,22 +40,9 @@ def setup_method(self): self.instances = instances self.lazy_instances = LazyIterable(instances) - def get_mock_reader(self) -> DatasetReader: - class MockReader(DatasetReader): - def __init__(self, instances, **kwargs): - super().__init__(**kwargs) - self.instances = instances - - def _read(self, file_path: str): - for instance in self.instances: - yield instance - - return MockReader(self.instances) - def create_instance(self, str_tokens: List[str]): tokens = [Token(t) for t in str_tokens] instance = Instance({"text": TextField(tokens, self.token_indexers)}) - instance.index_fields(self.vocab) return instance def create_instances_from_token_counts(self, token_counts: List[int]) -> List[Instance]: diff --git a/tests/data/token_indexers/pretrained_transformer_indexer_test.py b/tests/data/token_indexers/pretrained_transformer_indexer_test.py index d817af9b392..f15f6096a36 100644 --- a/tests/data/token_indexers/pretrained_transformer_indexer_test.py +++ b/tests/data/token_indexers/pretrained_transformer_indexer_test.py @@ -99,7 +99,7 @@ def test_transformers_vocab_sizes(self, model_name): def test_transformers_vocabs_added_correctly(self): namespace, model_name = "tags", "roberta-base" - tokenizer = cached_transformers.get_tokenizer(model_name, use_fast=False) + tokenizer = cached_transformers.get_tokenizer(model_name) allennlp_tokenizer = PretrainedTransformerTokenizer(model_name) indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace) allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!") diff --git a/tests/data/tokenizers/letters_digits_tokenizer_test.py b/tests/data/tokenizers/letters_digits_tokenizer_test.py index 10309a8355d..07673c07b35 100644 --- a/tests/data/tokenizers/letters_digits_tokenizer_test.py +++ b/tests/data/tokenizers/letters_digits_tokenizer_test.py @@ -1,5 +1,6 @@ from allennlp.common.testing import AllenNlpTestCase -from allennlp.data.tokenizers import Token, LettersDigitsTokenizer +from allennlp.data.tokenizers.letters_digits_tokenizer import LettersDigitsTokenizer +from allennlp.data.tokenizers.token import Token class TestLettersDigitsTokenizer(AllenNlpTestCase): diff --git a/tests/data/tokenizers/spacy_tokenizer_test.py b/tests/data/tokenizers/spacy_tokenizer_test.py index 87756ce2d4b..5f445453b69 100644 --- a/tests/data/tokenizers/spacy_tokenizer_test.py +++ b/tests/data/tokenizers/spacy_tokenizer_test.py @@ -1,7 +1,8 @@ import spacy from allennlp.common.testing import AllenNlpTestCase -from allennlp.data.tokenizers import Token, SpacyTokenizer +from allennlp.data.tokenizers.token import Token +from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer class TestSpacyTokenizer(AllenNlpTestCase): diff --git a/tests/data/vocabulary_test.py b/tests/data/vocabulary_test.py index 69b90c718f1..91d9c0e3021 100644 
--- a/tests/data/vocabulary_test.py +++ b/tests/data/vocabulary_test.py @@ -873,19 +873,3 @@ def test_from_files_with_model_archive(self): vocab = Vocabulary.from_files(str(self.model_archive)) vocab.get_namespaces() == {"tokens", "labels"} assert vocab.get_token_from_index(3, namespace="tokens") == "u.n." - - -class TestVocabularyFromPretrainedTransformer(AllenNlpTestCase): - @pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"]) - def test_from_pretrained_transformer(self, model_name): - namespace = "tokens" - from allennlp.common import cached_transformers - - tokenizer = cached_transformers.get_tokenizer(model_name) - - vocab = Vocabulary.from_pretrained_transformer(model_name, namespace=namespace) - assert vocab._token_to_index[namespace] == tokenizer.get_vocab() - vocab.save_to_files(self.TEST_DIR / "vocab") - - vocab1 = Vocabulary.from_files(self.TEST_DIR / "vocab") - assert vocab1._token_to_index[namespace] == tokenizer.get_vocab() diff --git a/tests/models/archival_test.py b/tests/models/archival_test.py index 9e7ed2fee31..3b135536622 100644 --- a/tests/models/archival_test.py +++ b/tests/models/archival_test.py @@ -43,7 +43,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"), @@ -90,7 +95,8 @@ def test_archive_model_uses_archive_path(self): train_model(self.params, serialization_dir=serialization_dir) # Use a new path. archive_model( - serialization_dir=serialization_dir, archive_path=serialization_dir / "new_path.tar.gz" + serialization_dir=serialization_dir, + archive_path=serialization_dir / "new_path.tar.gz", ) archive = load_archive(serialization_dir / "new_path.tar.gz") assert archive diff --git a/tests/modules/attention/scaled_dot_product_attention_test.py b/tests/modules/attention/scaled_dot_product_attention_test.py index 247cafc200d..6dd9314691b 100644 --- a/tests/modules/attention/scaled_dot_product_attention_test.py +++ b/tests/modules/attention/scaled_dot_product_attention_test.py @@ -5,7 +5,9 @@ from allennlp.common import Params from allennlp.common.testing.test_case import AllenNlpTestCase from allennlp.modules.attention.attention import Attention -from allennlp.modules.attention.scaled_dot_product_attention import ScaledDotProductAttention +from allennlp.modules.attention.scaled_dot_product_attention import ( + ScaledDotProductAttention, +) class TestScaledDotProductAttention(AllenNlpTestCase): diff --git a/tests/modules/elmo_test.py b/tests/modules/elmo_test.py index f885f6f39ec..77dfa797fa2 100644 --- a/tests/modules/elmo_test.py +++ b/tests/modules/elmo_test.py @@ -12,7 +12,8 @@ from allennlp.data.fields import TextField from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer -from allennlp.data.data_loaders import SimpleDataLoader +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset +from allennlp.data.dataloader import PyTorchDataLoader from allennlp.modules.elmo import _ElmoBiLm, _ElmoCharacterEncoder, Elmo from allennlp.modules.token_embedders import ElmoTokenEmbedder from allennlp.nn.util import remove_sentence_boundaries @@ 
-99,9 +100,9 @@ def test_elmo_bilm(self): instances.append(instance) vocab = Vocabulary() + dataset = AllennlpDataset(instances, vocab) # Now finally we can iterate through batches. - loader = SimpleDataLoader(instances, 3) - loader.index_with(vocab) + loader = PyTorchDataLoader(dataset, 3) for i, batch in enumerate(loader): lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"]) top_layer_embeddings, mask = remove_sentence_boundaries( diff --git a/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py b/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py index 4dcb3db98d8..6a6eabe48e6 100644 --- a/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py +++ b/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py @@ -32,7 +32,7 @@ def test_positional_embeddings(positional_encoding: Optional[str]): @pytest.mark.parametrize("positional_encoding", [None, "sinusoidal", "embedding"]) -def test_positional_encodings(positional_encoding: Optional[str]): +def test_mask_works(positional_encoding: Optional[str]): # All sizes are prime, making them easy to find during debugging. batch_size = 3 max_seq_len = 11 @@ -44,35 +44,28 @@ def test_positional_encodings(positional_encoding: Optional[str]): transformer.eval() with torch.no_grad(): - # We test this by running it twice, once with a shuffled sequence. The results should be the same if there - # is no positional encoding, and different otherwise. + # Construct inputs and masks inputs = torch.randn(batch_size, max_seq_len, dims) - mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) + all_ones_mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) + mask = all_ones_mask.clone() for b in range(batch_size): mask[b, max_seq_len - b :] = False - unshuffled_output = transformer(inputs, mask) + altered_inputs = inputs + (~mask).unsqueeze(2) * 10.0 - shuffle = torch.arange(0, max_seq_len).unsqueeze(0).expand_as(mask).clone() - for b in range(batch_size): - # Take care not to shuffle the masked values - perm = torch.randperm(max_seq_len - b) - shuffle[b, : max_seq_len - b] = shuffle[b, perm] - shuffle = shuffle.unsqueeze(2).expand_as(inputs) - shuffled_input = torch.gather(inputs, 1, shuffle) - shuffled_output = transformer(shuffled_input, mask) + # Make sure there is a difference without the mask + assert not torch.allclose( + transformer(inputs, all_ones_mask), transformer(altered_inputs, all_ones_mask) + ) - if positional_encoding is None: - assert torch.allclose( - torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-5 - ) - else: - assert not torch.allclose( - torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-5 - ) + # Make sure there is no difference with the mask + assert torch.allclose( + torch.masked_select(transformer(inputs, mask), mask.unsqueeze(2)), + torch.masked_select(transformer(altered_inputs, mask), mask.unsqueeze(2)), + ) @pytest.mark.parametrize("positional_encoding", [None, "sinusoidal", "embedding"]) -def test_mask_works(positional_encoding: Optional[str]): +def test_positional_encodings(positional_encoding: Optional[str]): # All sizes are prime, making them easy to find during debugging. batch_size = 3 max_seq_len = 11 @@ -84,21 +77,28 @@ def test_mask_works(positional_encoding: Optional[str]): transformer.eval() with torch.no_grad(): - # Construct inputs and masks + # We test this by running it twice, once with a shuffled sequence. 
The results should be the same if there + # is no positional encoding, and different otherwise. inputs = torch.randn(batch_size, max_seq_len, dims) - all_ones_mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) - mask = all_ones_mask.clone() + mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) for b in range(batch_size): mask[b, max_seq_len - b :] = False - altered_inputs = inputs + (~mask).unsqueeze(2) * 10.0 + unshuffled_output = transformer(inputs, mask) - # Make sure there is a difference without the mask - assert not torch.allclose( - transformer(inputs, all_ones_mask), transformer(altered_inputs, all_ones_mask) - ) + shuffle = torch.arange(0, max_seq_len).unsqueeze(0).expand_as(mask).clone() + for b in range(batch_size): + # Take care not to shuffle the masked values + perm = torch.randperm(max_seq_len - b) + shuffle[b, : max_seq_len - b] = shuffle[b, perm] + shuffle = shuffle.unsqueeze(2).expand_as(inputs) + shuffled_input = torch.gather(inputs, 1, shuffle) + shuffled_output = transformer(shuffled_input, mask) - # Make sure there is no difference with the mask - assert torch.allclose( - torch.masked_select(transformer(inputs, mask), mask.unsqueeze(2)), - torch.masked_select(transformer(altered_inputs, mask), mask.unsqueeze(2)), - ) + if positional_encoding is None: + assert torch.allclose( + torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-7 + ) + else: + assert not torch.allclose( + torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-7 + ) diff --git a/tests/modules/token_embedders/pretrained_transformer_embedder_test.py b/tests/modules/token_embedders/pretrained_transformer_embedder_test.py index 0b454c84db5..72233c944ee 100644 --- a/tests/modules/token_embedders/pretrained_transformer_embedder_test.py +++ b/tests/modules/token_embedders/pretrained_transformer_embedder_test.py @@ -315,18 +315,3 @@ def test_encoder_decoder_model(self): token_ids = torch.LongTensor([[1, 2, 3], [2, 3, 4]]) mask = torch.ones_like(token_ids).bool() token_embedder(token_ids, mask) - - def test_embeddings_resize(self): - regular_token_embedder = PretrainedTransformerEmbedder("bert-base-cased") - assert ( - regular_token_embedder.transformer_model.embeddings.word_embeddings.num_embeddings - == 28996 - ) - tokenizer_kwargs = {"additional_special_tokens": [""]} - enhanced_token_embedder = PretrainedTransformerEmbedder( - "bert-base-cased", tokenizer_kwargs=tokenizer_kwargs - ) - assert ( - enhanced_token_embedder.transformer_model.embeddings.word_embeddings.num_embeddings - == 28997 - ) diff --git a/tests/modules/transformer/toolkit_test.py b/tests/modules/transformer/toolkit_test.py index cd1bf60e9fd..df995d1f076 100644 --- a/tests/modules/transformer/toolkit_test.py +++ b/tests/modules/transformer/toolkit_test.py @@ -112,16 +112,24 @@ def forward( medium_layers = dict(medium.combined_transformer.layers.named_modules()) assert_equal_parameters( - medium_layers["0"], pretrained_layers["8"], TransformerStack._huggingface_mapping + medium_layers["0"], + pretrained_layers["8"], + TransformerStack._huggingface_mapping, ) assert_equal_parameters( - medium_layers["1"], pretrained_layers["9"], TransformerStack._huggingface_mapping + medium_layers["1"], + pretrained_layers["9"], + TransformerStack._huggingface_mapping, ) assert_equal_parameters( - medium_layers["2"], pretrained_layers["10"], TransformerStack._huggingface_mapping + medium_layers["2"], + pretrained_layers["10"], + TransformerStack._huggingface_mapping, ) assert_equal_parameters( - 
medium_layers["3"], pretrained_layers["11"], TransformerStack._huggingface_mapping + medium_layers["3"], + pretrained_layers["11"], + TransformerStack._huggingface_mapping, ) def test_combination_of_two_different_berts(self): diff --git a/tests/modules/transformer/transformer_embeddings_test.py b/tests/modules/transformer/transformer_embeddings_test.py index 08212ee15c9..9e267d3a9cf 100644 --- a/tests/modules/transformer/transformer_embeddings_test.py +++ b/tests/modules/transformer/transformer_embeddings_test.py @@ -124,7 +124,11 @@ def __init__( self.dropout = torch.nn.Dropout(dropout) def forward( - self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, ): if input_ids is not None: input_shape = input_ids.size() @@ -168,7 +172,9 @@ def test_forward_runs_with_inputs(self): token_type_ids = torch.tensor([[1, 0]], dtype=torch.long) position_ids = torch.tensor([[0, 1]]) self.transformer_embeddings.forward( - input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, ) def test_output_size(self): @@ -180,7 +186,9 @@ def test_output_size(self): params = Params(params) module = TransformerEmbeddings.from_params(params) output = module.forward( - input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, ) assert output.shape[-1] == 7 @@ -224,13 +232,17 @@ def test_forward_against_huggingface_output(self, module_name, hf_module): torch.manual_seed(1234) embeddings = embeddings.eval() # setting to eval mode to avoid non-deterministic dropout. output = embeddings.forward( - input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, ) torch.manual_seed(1234) hf_module = hf_module.eval() # setting to eval mode to avoid non-deterministic dropout. 
hf_output = hf_module.forward( - input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, ) assert torch.allclose(output, hf_output) diff --git a/tests/nn/util_test.py b/tests/nn/util_test.py index d98439534ff..705f3f7ab74 100644 --- a/tests/nn/util_test.py +++ b/tests/nn/util_test.py @@ -1427,6 +1427,25 @@ def test_combine_tensors_and_multiply_with_batch_size_one_and_seq_len_one(self): assert_almost_equal(result.size(), [1, seq_len_1, seq_len_2]) + def test_has_tensor(self): + + has_tensor = util.has_tensor + tensor = torch.tensor([1, 2, 3]) + + assert has_tensor(["a", 10, tensor]) + assert not has_tensor(["a", 10]) + + assert has_tensor(("a", 10, tensor)) + assert not has_tensor(("a", 10)) + + assert has_tensor({"a": tensor, "b": 1}) + assert not has_tensor({"a": 10, "b": 1}) + + assert has_tensor(tensor) + assert not has_tensor(3) + + assert has_tensor({"x": [0, {"inside": {"double_inside": [3, [10, tensor]]}}]}) + def test_combine_initial_dims(self): tensor = torch.randn(4, 10, 20, 17, 5) @@ -1452,13 +1471,13 @@ def test_inspect_model_parameters(self): assert parameters_inspection_dict == util.inspect_parameters(model) def test_move_to_device(self): - # We're faking the tensor here so that we can test the calls to .to() without actually + # We're faking the tensor here so that we can test the calls to .cuda() without actually # needing a GPU. class FakeTensor(torch.Tensor): def __init__(self): self._device = None - def to(self, device, **kwargs): + def cuda(self, device): self._device = device return self diff --git a/tests/training/learning_rate_schedulers/slanted_triangular_test.py b/tests/training/learning_rate_schedulers/slanted_triangular_test.py index 5280970a34a..fadd7582186 100644 --- a/tests/training/learning_rate_schedulers/slanted_triangular_test.py +++ b/tests/training/learning_rate_schedulers/slanted_triangular_test.py @@ -5,10 +5,11 @@ import torch import pytest +from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset from allennlp.common import Lazy, Params from allennlp.common.checks import ConfigurationError from allennlp.common.testing import AllenNlpTestCase -from allennlp.data.data_loaders import SimpleDataLoader +from allennlp.data import PyTorchDataLoader from allennlp.training import Trainer from allennlp.training.learning_rate_schedulers import LearningRateScheduler, SlantedTriangular from allennlp.training.optimizers import Optimizer @@ -113,14 +114,14 @@ def test_from_params_in_trainer(self): ) # The method called in the logic below only checks the length of this list, not its # contents, so this should be safe. 
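+        # PyTorchDataLoader takes a torch Dataset rather than a plain list, hence the
+        # AllennlpDataset wrapper around the placeholder "instances".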
- instances = [1] * 40 + instances = AllennlpDataset([1] * 40) optim = self._get_optimizer() trainer = Trainer.from_params( model=self.model, optimizer=Lazy(lambda **kwargs: optim), serialization_dir=self.TEST_DIR, params=params, - data_loader=SimpleDataLoader(instances, batch_size=10), + data_loader=PyTorchDataLoader(instances, batch_size=10), ) assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular) @@ -150,7 +151,7 @@ def test_from_params_in_trainer(self): optimizer=Lazy(lambda **kwargs: optim), serialization_dir=self.TEST_DIR, params=params, - data_loader=SimpleDataLoader(instances, batch_size=10), + data_loader=PyTorchDataLoader(instances, batch_size=10), ) assert trainer._learning_rate_scheduler.num_epochs == 3 diff --git a/tests/training/optimizer_test.py b/tests/training/optimizer_test.py index b396cdcd4cc..1c330d5d718 100644 --- a/tests/training/optimizer_test.py +++ b/tests/training/optimizer_test.py @@ -20,7 +20,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, } ) self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params) @@ -90,10 +95,19 @@ def setup_method(self): { "text_field_embedder": { "token_embedders": { - "tokens": {"type": "embedding", "embedding_dim": 5, "sparse": True} + "tokens": { + "type": "embedding", + "embedding_dim": 5, + "sparse": True, + } } }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, } ) self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params) diff --git a/tests/training/trainer_test.py b/tests/training/trainer_test.py index a20bc9fba4c..be37ebc0e5c 100644 --- a/tests/training/trainer_test.py +++ b/tests/training/trainer_test.py @@ -16,7 +16,11 @@ from allennlp.common.params import Params from allennlp.common.testing import AllenNlpTestCase, requires_gpu, requires_multi_gpu from allennlp.data import Vocabulary -from allennlp.data.data_loaders import MultiProcessDataLoader, SimpleDataLoader, TensorDict +from allennlp.data.data_loaders import ( + MultiProcessDataLoader, + SimpleDataLoader, + TensorDict, +) from allennlp.data.dataset_readers import SequenceTaggingDatasetReader from allennlp.models.model import Model from allennlp.models.simple_tagger import SimpleTagger @@ -52,7 +56,12 @@ def setup_method(self): "text_field_embedder": { "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, + "encoder": { + "type": "lstm", + "input_size": 5, + "hidden_size": 7, + "num_layers": 2, + }, } ) self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params) @@ -559,7 +568,9 @@ def test_trainer_can_run_with_lr_scheduler(self): trainer.train() def test_trainer_sends_metric_to_lr_scheduler(self): - from allennlp.training.learning_rate_schedulers import ReduceOnPlateauLearningRateScheduler + from allennlp.training.learning_rate_schedulers import ( + ReduceOnPlateauLearningRateScheduler, + ) class RecordMetricLearningRateScheduler(ReduceOnPlateauLearningRateScheduler): def __init__(self, optimizer: Optimizer): @@ -960,7 +971,10 @@ def test_track_epoch_callback(self): def 
test_trainer_callback_is_called_everywhere(self): class FakeTrainerCallback(TrainerCallback): def on_start( - self, trainer: "GradientDescentTrainer", is_primary: bool = True, **kwargs + self, + trainer: "GradientDescentTrainer", + is_primary: bool = True, + **kwargs, ) -> None: if not hasattr(trainer, "start_callback_is_fired_first"): trainer.start_callback_is_fired_first = True # type: ignore