From 299c1bca4c27dc16544820569a57ab27565ecc73 Mon Sep 17 00:00:00 2001
From: Dusan Varis
Date: Wed, 9 Jan 2019 16:29:51 +0100
Subject: [PATCH 1/3] workaround for train_set batching during inference time

---
 neuralmonkey/learning_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/neuralmonkey/learning_utils.py b/neuralmonkey/learning_utils.py
index 50e0e0711..102eafb42 100644
--- a/neuralmonkey/learning_utils.py
+++ b/neuralmonkey/learning_utils.py
@@ -13,7 +13,7 @@
 from termcolor import colored
 
 from neuralmonkey.logging import log, log_print, warn
-from neuralmonkey.dataset import Dataset
+from neuralmonkey.dataset import Dataset, BatchingScheme
 from neuralmonkey.tf_manager import TensorFlowManager
 from neuralmonkey.runners.base_runner import (
     BaseRunner, ExecutionResult, GraphExecutor, OutputSeries)
@@ -85,6 +85,9 @@ def training_loop(cfg: Namespace) -> None:
             trainer_result = cfg.tf_manager.execute(
                 batch, feedables, cfg.trainers, train=True,
                 summaries=True)
+            # workaround: we need to use validation batching scheme
+            # during evaluation
+            batch.batching = BatchingScheme(batch_size=cfg.batch_size)
             train_results, train_outputs, f_batch = run_on_dataset(
                 cfg.tf_manager, cfg.runners, cfg.dataset_runner, batch,
                 cfg.postprocess, write_out=False)
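A minimal sketch of the two kinds of batching schemes this workaround juggles (illustrative only; the keyword arguments are the ones visible in this patch series, the concrete numbers are made up):

    from neuralmonkey.dataset import BatchingScheme

    # plain scheme: a fixed number of sentences per batch, as used for
    # validation and, via the workaround above, for evaluating the train set
    plain = BatchingScheme(batch_size=32)

    # bucketed scheme (added in the next patch): batch size varies with
    # sentence length; len(bucket_batch_sizes) == len(bucket_boundaries) + 1
    bucketed = BatchingScheme(bucket_boundaries=[8, 12],
                              bucket_batch_sizes=[512, 341, 256])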
From 7a623121889ceef2168f491bff18f896aef3f56f Mon Sep 17 00:00:00 2001
From: Dusan Varis
Date: Wed, 30 Jan 2019 13:15:52 +0100
Subject: [PATCH 2/3] added batching schemes from tensor2tensor

---
 neuralmonkey/dataset.py | 88 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/neuralmonkey/dataset.py b/neuralmonkey/dataset.py
index ae963d75d..8a80502f0 100644
--- a/neuralmonkey/dataset.py
+++ b/neuralmonkey/dataset.py
@@ -95,6 +95,94 @@ def __init__(self,
     # pylint: enable=too-few-public-methods
 
 
+def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1):
+    """A default set of length-bucket boundaries."""
+    assert length_bucket_step > 1.0
+    x = min_length
+    boundaries = []
+    while x < max_length:
+        boundaries.append(x)
+        x = max(x + 1, int(x * length_bucket_step))
+    return boundaries
+
+
+def get_batching_scheme(batch_size: int,
+                        max_length: int = None,
+                        min_length_bucket: int = 8,
+                        length_bucket_step: float = 1.1,
+                        drop_long_sequences: bool = False,
+                        shard_multiplier: int = 1,
+                        length_multiplier: int = 1,
+                        min_length: int = 0) -> BatchingScheme:
+    """A batching scheme based on model hyperparameters.
+    Every batch contains a number of sequences divisible by `shard_multiplier`.
+    Args:
+        batch_size: int, total number of tokens in a batch.
+        max_length: int, sequences longer than this will be skipped. Defaults to
+            batch_size.
+        min_length_bucket: int
+        length_bucket_step: float greater than 1.0
+        drop_long_sequences: bool, if True, then sequences longer than
+            `max_length` are dropped. This prevents generating batches with
+            more than the usual number of tokens, which can cause out-of-memory
+            errors.
+        shard_multiplier: an integer increasing the batch_size to suit splitting
+            across datashards.
+        length_multiplier: an integer multiplier that is used to increase the
+            batch sizes and sequence length tolerance.
+        min_length: int, sequences shorter than this will be skipped.
+    Returns:
+        A dictionary with parameters that can be passed to input_pipeline:
+        * boundaries: list of bucket boundaries
+        * batch_sizes: list of batch sizes for each length bucket
+        * max_length: int, maximum length of an example
+    Raises:
+        ValueError: If min_length > max_length
+    """
+    max_length = max_length or batch_size
+    if max_length < min_length:
+        raise ValueError("max_length must be greater or equal to min_length")
+
+    boundaries = _bucket_boundaries(max_length, min_length_bucket,
+                                    length_bucket_step)
+    boundaries = [boundary * length_multiplier for boundary in boundaries]
+    max_length *= length_multiplier
+
+    batch_sizes = [
+      max(1, batch_size // length) for length in boundaries + [max_length]
+    ]
+    max_batch_size = max(batch_sizes)
+    # Since the Datasets API only allows a single constant for window_size,
+    # and it needs divide all bucket_batch_sizes, we pick a highly-composite
+    # window size and then round down all batch sizes to divisors of that window
+    # size, so that a window can always be divided evenly into batches.
+    # TODO(noam): remove this when Dataset API improves.
+    highly_composite_numbers = [
+      1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680,
+      2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360, 50400, 55440,
+      83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280,
+      720720, 1081080, 1441440, 2162160, 2882880, 3603600, 4324320, 6486480,
+      7207200, 8648640, 10810800, 14414400, 17297280, 21621600, 32432400,
+      36756720, 43243200, 61261200, 73513440, 110270160
+    ]
+    window_size = max(
+      [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
+    divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
+    batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
+    window_size *= shard_multiplier
+    batch_sizes = [bs * shard_multiplier for bs in batch_sizes]
+    # The Datasets API splits one window into multiple batches, which
+    # produces runs of many consecutive batches of the same size. This
+    # is bad for training. To solve this, we will shuffle the batches
+    # using a queue which must be several times as large as the maximum
+    # number of batches per window.
+    max_batches_per_window = window_size // min(batch_sizes)
+    shuffle_queue_size = max_batches_per_window * 3
+
+    ret = BatchingScheme(bucket_boundaries=boundaries,
+                         bucket_batch_sizes=batch_sizes)
+    return ret
+
 # The protected functions below are designed to convert the ambiguous spec
 # structures to a normalized form.
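A short usage sketch of the helper introduced above (illustrative; `train_data` standing in for a Dataset instance is an assumption, and attaching the scheme through the `batching` attribute mirrors the assignment in PATCH 1/3):

    from neuralmonkey.dataset import get_batching_scheme

    # Roughly 4096 tokens per batch: boundaries grow geometrically
    # (8, 9, 10, ..., 20, 22, 24, ...) and each bucket gets its own batch size.
    scheme = get_batching_scheme(batch_size=4096,
                                 min_length_bucket=8,
                                 length_bucket_step=1.1)
    train_data.batching = scheme  # hypothetical Dataset instance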
From 1d968b5b7a15edca8c1de5e0d5f98ebd880b849d Mon Sep 17 00:00:00 2001
From: Dusan Varis
Date: Wed, 30 Jan 2019 13:37:53 +0100
Subject: [PATCH 3/3] fixing failed travis tests

---
 neuralmonkey/dataset.py                      | 50 ++++++++------------
 neuralmonkey/readers/string_vector_reader.py |  2 +-
 tests/hier-multiattention.ini                |  1 +
 3 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/neuralmonkey/dataset.py b/neuralmonkey/dataset.py
index 8a80502f0..410d2ec07 100644
--- a/neuralmonkey/dataset.py
+++ b/neuralmonkey/dataset.py
@@ -96,7 +96,7 @@ def __init__(self,
 
 
 def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1):
-    """A default set of length-bucket boundaries."""
+    """Create a default set of length-bucket boundaries."""
     assert length_bucket_step > 1.0
     x = min_length
     boundaries = []
@@ -110,28 +110,25 @@ def get_batching_scheme(batch_size: int,
                         max_length: int = None,
                         min_length_bucket: int = 8,
                         length_bucket_step: float = 1.1,
-                        drop_long_sequences: bool = False,
                         shard_multiplier: int = 1,
                         length_multiplier: int = 1,
                         min_length: int = 0) -> BatchingScheme:
-    """A batching scheme based on model hyperparameters.
+    """Create a batching scheme based on model hyperparameters.
+
     Every batch contains a number of sequences divisible by `shard_multiplier`.
+
     Args:
         batch_size: int, total number of tokens in a batch.
-        max_length: int, sequences longer than this will be skipped. Defaults to
-            batch_size.
+        max_length: int, sequences longer than this will be skipped. Defaults
+            to batch_size.
         min_length_bucket: int
         length_bucket_step: float greater than 1.0
-        drop_long_sequences: bool, if True, then sequences longer than
-            `max_length` are dropped. This prevents generating batches with
-            more than the usual number of tokens, which can cause out-of-memory
-            errors.
-        shard_multiplier: an integer increasing the batch_size to suit splitting
-            across datashards.
+        shard_multiplier: an integer increasing the batch_size to suit
+            splitting across datashards.
         length_multiplier: an integer multiplier that is used to increase the
            batch sizes and sequence length tolerance.
         min_length: int, sequences shorter than this will be skipped.
-    Returns:
+    Return:
        A dictionary with parameters that can be passed to input_pipeline:
        * boundaries: list of bucket boundaries
        * batch_sizes: list of batch sizes for each length bucket
@@ -149,40 +146,33 @@ def get_batching_scheme(batch_size: int,
     max_length *= length_multiplier
 
     batch_sizes = [
-      max(1, batch_size // length) for length in boundaries + [max_length]
+        max(1, batch_size // length) for length in boundaries + [max_length]
     ]
     max_batch_size = max(batch_sizes)
     # Since the Datasets API only allows a single constant for window_size,
     # and it needs divide all bucket_batch_sizes, we pick a highly-composite
-    # window size and then round down all batch sizes to divisors of that window
-    # size, so that a window can always be divided evenly into batches.
-    # TODO(noam): remove this when Dataset API improves.
+    # window size and then round down all batch sizes to divisors of that
+    # window size, so that a window can always be divided evenly into batches.
     highly_composite_numbers = [
-      1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680,
-      2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360, 50400, 55440,
-      83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280,
-      720720, 1081080, 1441440, 2162160, 2882880, 3603600, 4324320, 6486480,
-      7207200, 8648640, 10810800, 14414400, 17297280, 21621600, 32432400,
-      36756720, 43243200, 61261200, 73513440, 110270160
+        1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260,
+        1680, 2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360,
+        50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960,
+        554400, 665280, 720720, 1081080, 1441440, 2162160, 2882880, 3603600,
+        4324320, 6486480, 7207200, 8648640, 10810800, 14414400, 17297280,
+        21621600, 32432400, 36756720, 43243200, 61261200, 73513440, 110270160
     ]
     window_size = max(
-      [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
+        [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
     divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
     batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
     window_size *= shard_multiplier
     batch_sizes = [bs * shard_multiplier for bs in batch_sizes]
-    # The Datasets API splits one window into multiple batches, which
-    # produces runs of many consecutive batches of the same size. This
-    # is bad for training. To solve this, we will shuffle the batches
-    # using a queue which must be several times as large as the maximum
-    # number of batches per window.
-    max_batches_per_window = window_size // min(batch_sizes)
-    shuffle_queue_size = max_batches_per_window * 3
 
     ret = BatchingScheme(bucket_boundaries=boundaries,
                          bucket_batch_sizes=batch_sizes)
     return ret
+
 
 # The protected functions below are designed to convert the ambiguous spec
 # structures to a normalized form.
diff --git a/neuralmonkey/readers/string_vector_reader.py b/neuralmonkey/readers/string_vector_reader.py
index d6545b2a3..439a23838 100644
--- a/neuralmonkey/readers/string_vector_reader.py
+++ b/neuralmonkey/readers/string_vector_reader.py
@@ -13,7 +13,7 @@ def process_line(line: str, lineno: int, path: str) -> np.ndarray:
 
         return np.array(numbers, dtype=dtype)
 
-    def reader(files: List[str])-> Iterable[List[np.ndarray]]:
+    def reader(files: List[str]) -> Iterable[List[np.ndarray]]:
         for path in files:
             current_line = 0
 
diff --git a/tests/hier-multiattention.ini b/tests/hier-multiattention.ini
index f4a4b5c68..f203ab665 100644
--- a/tests/hier-multiattention.ini
+++ b/tests/hier-multiattention.ini
@@ -4,6 +4,7 @@ tf_manager=
 output="tests/outputs/hier-multiattention"
 overwrite_output_dir=True
 epochs=1
+batch_size=1
 train_dataset=
 val_dataset=
 trainer=
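A worked example of the "highly composite window size" rounding kept in the hunk above (the numbers here are made up for illustration; only the arithmetic follows the patched code):

    # With a budget of 4096 tokens and a smallest length bucket of 8,
    # the largest per-bucket batch size is 4096 // 8 = 512.
    highly_composite_numbers = [1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180,
                                240, 360, 720, 840, 1260, 1680]
    max_batch_size = 4096 // 8                        # 512
    window_size = max(i for i in highly_composite_numbers
                      if i <= 3 * max_batch_size)     # 1260
    divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
    # 512 is rounded down to the largest divisor of 1260 that fits,
    # so a window of 1260 sequences always splits into whole batches:
    print(max(d for d in divisors if d <= max_batch_size))  # prints 420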