From 299c1bca4c27dc16544820569a57ab27565ecc73 Mon Sep 17 00:00:00 2001
From: Dusan Varis
Date: Wed, 9 Jan 2019 16:29:51 +0100
Subject: [PATCH 1/3] workaround for train_set batching during inference time

---
 neuralmonkey/learning_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/neuralmonkey/learning_utils.py b/neuralmonkey/learning_utils.py
index 50e0e0711..102eafb42 100644
--- a/neuralmonkey/learning_utils.py
+++ b/neuralmonkey/learning_utils.py
@@ -13,7 +13,7 @@
 from termcolor import colored
 
 from neuralmonkey.logging import log, log_print, warn
-from neuralmonkey.dataset import Dataset
+from neuralmonkey.dataset import Dataset, BatchingScheme
 from neuralmonkey.tf_manager import TensorFlowManager
 from neuralmonkey.runners.base_runner import (
     BaseRunner, ExecutionResult, GraphExecutor, OutputSeries)
@@ -85,6 +85,9 @@ def training_loop(cfg: Namespace) -> None:
             trainer_result = cfg.tf_manager.execute(
                 batch, feedables, cfg.trainers, train=True,
                 summaries=True)
+            # workaround: we need to use validation batching scheme
+            # during evaluation
+            batch.batching = BatchingScheme(batch_size=cfg.batch_size)
             train_results, train_outputs, f_batch = run_on_dataset(
                 cfg.tf_manager, cfg.runners, cfg.dataset_runner, batch,
                 cfg.postprocess, write_out=False)
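A minimal sketch of the two kinds of batching schemes this workaround juggles (illustrative only; the keyword arguments are the ones visible in this patch series, the concrete numbers are made up):

    from neuralmonkey.dataset import BatchingScheme

    # plain scheme: a fixed number of sentences per batch, as used for
    # validation and, via the workaround above, for evaluating the train set
    plain = BatchingScheme(batch_size=32)

    # bucketed scheme (added in the next patch): batch size varies with
    # sentence length; len(bucket_batch_sizes) == len(bucket_boundaries) + 1
    bucketed = BatchingScheme(bucket_boundaries=[8, 12],
                              bucket_batch_sizes=[512, 341, 256])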
From 7a623121889ceef2168f491bff18f896aef3f56f Mon Sep 17 00:00:00 2001
From: Dusan Varis
Date: Wed, 30 Jan 2019 13:15:52 +0100
Subject: [PATCH 2/3] added batching schemes from tensor2tensor

---
 neuralmonkey/dataset.py | 88 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/neuralmonkey/dataset.py b/neuralmonkey/dataset.py
index ae963d75d..8a80502f0 100644
--- a/neuralmonkey/dataset.py
+++ b/neuralmonkey/dataset.py
@@ -95,6 +95,94 @@ def __init__(self,
     # pylint: enable=too-few-public-methods
 
 
+def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1):
+    """A default set of length-bucket boundaries."""
+    assert length_bucket_step > 1.0
+    x = min_length
+    boundaries = []
+    while x < max_length:
+        boundaries.append(x)
+        x = max(x + 1, int(x * length_bucket_step))
+    return boundaries
+
+
+def get_batching_scheme(batch_size: int,
+                        max_length: int = None,
+                        min_length_bucket: int = 8,
+                        length_bucket_step: float = 1.1,
+                        drop_long_sequences: bool = False,
+                        shard_multiplier: int = 1,
+                        length_multiplier: int = 1,
+                        min_length: int = 0) -> BatchingScheme:
+    """A batching scheme based on model hyperparameters.
+    Every batch contains a number of sequences divisible by `shard_multiplier`.
+    Args:
+        batch_size: int, total number of tokens in a batch.
+        max_length: int, sequences longer than this will be skipped. Defaults to
+            batch_size.
+        min_length_bucket: int
+        length_bucket_step: float greater than 1.0
+        drop_long_sequences: bool, if True, then sequences longer than
+            `max_length` are dropped. This prevents generating batches with
+            more than the usual number of tokens, which can cause out-of-memory
+            errors.
+        shard_multiplier: an integer increasing the batch_size to suit splitting
+            across datashards.
+        length_multiplier: an integer multiplier that is used to increase the
+            batch sizes and sequence length tolerance.
+        min_length: int, sequences shorter than this will be skipped.
+    Returns:
+        A dictionary with parameters that can be passed to input_pipeline:
+        * boundaries: list of bucket boundaries
+        * batch_sizes: list of batch sizes for each length bucket
+        * max_length: int, maximum length of an example
+    Raises:
+        ValueError: If min_length > max_length
+    """
+    max_length = max_length or batch_size
+    if max_length < min_length:
+        raise ValueError("max_length must be greater or equal to min_length")
+
+    boundaries = _bucket_boundaries(max_length, min_length_bucket,
+                                    length_bucket_step)
+    boundaries = [boundary * length_multiplier for boundary in boundaries]
+    max_length *= length_multiplier
+
+    batch_sizes = [
+      max(1, batch_size // length) for length in boundaries + [max_length]
+    ]
+    max_batch_size = max(batch_sizes)
+    # Since the Datasets API only allows a single constant for window_size,
+    # and it needs divide all bucket_batch_sizes, we pick a highly-composite
+    # window size and then round down all batch sizes to divisors of that window
+    # size, so that a window can always be divided evenly into batches.
+    # TODO(noam): remove this when Dataset API improves.
+    highly_composite_numbers = [
+      1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680,
+      2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360, 50400, 55440,
+      83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280,
+      720720, 1081080, 1441440, 2162160, 2882880, 3603600, 4324320, 6486480,
+      7207200, 8648640, 10810800, 14414400, 17297280, 21621600, 32432400,
+      36756720, 43243200, 61261200, 73513440, 110270160
+    ]
+    window_size = max(
+      [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
+    divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
+    batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
+    window_size *= shard_multiplier
+    batch_sizes = [bs * shard_multiplier for bs in batch_sizes]
+    # The Datasets API splits one window into multiple batches, which
+    # produces runs of many consecutive batches of the same size. This
+    # is bad for training. To solve this, we will shuffle the batches
+    # using a queue which must be several times as large as the maximum
+    # number of batches per window.
+    max_batches_per_window = window_size // min(batch_sizes)
+    shuffle_queue_size = max_batches_per_window * 3
+
+    ret = BatchingScheme(bucket_boundaries=boundaries,
+                         bucket_batch_sizes=batch_sizes)
+    return ret
+
 # The protected functions below are designed to convert the ambiguous spec
 # structures to a normalized form.
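A short usage sketch of the helper introduced above (illustrative; `train_data` standing in for a Dataset instance is an assumption, and attaching the scheme through the `batching` attribute mirrors the assignment in PATCH 1/3):

    from neuralmonkey.dataset import get_batching_scheme

    # Roughly 4096 tokens per batch: boundaries grow geometrically
    # (8, 9, 10, ..., 20, 22, 24, ...) and each bucket gets its own batch size.
    scheme = get_batching_scheme(batch_size=4096,
                                 min_length_bucket=8,
                                 length_bucket_step=1.1)
    train_data.batching = scheme  # hypothetical Dataset instance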
From 1d968b5b7a15edca8c1de5e0d5f98ebd880b849d Mon Sep 17 00:00:00 2001
From: Dusan Varis
Date: Wed, 30 Jan 2019 13:37:53 +0100
Subject: [PATCH 3/3] fixing failed travis tests

---
 neuralmonkey/dataset.py                      | 50 ++++++++------------
 neuralmonkey/readers/string_vector_reader.py |  2 +-
 tests/hier-multiattention.ini                |  1 +
 3 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/neuralmonkey/dataset.py b/neuralmonkey/dataset.py
index 8a80502f0..410d2ec07 100644
--- a/neuralmonkey/dataset.py
+++ b/neuralmonkey/dataset.py
@@ -96,7 +96,7 @@ def __init__(self,
 
 
 def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1):
-    """A default set of length-bucket boundaries."""
+    """Create a default set of length-bucket boundaries."""
     assert length_bucket_step > 1.0
     x = min_length
     boundaries = []
@@ -110,28 +110,25 @@ def get_batching_scheme(batch_size: int,
                         max_length: int = None,
                         min_length_bucket: int = 8,
                         length_bucket_step: float = 1.1,
-                        drop_long_sequences: bool = False,
                         shard_multiplier: int = 1,
                         length_multiplier: int = 1,
                         min_length: int = 0) -> BatchingScheme:
-    """A batching scheme based on model hyperparameters.
+    """Create a batching scheme based on model hyperparameters.
+
     Every batch contains a number of sequences divisible by `shard_multiplier`.
+
     Args:
         batch_size: int, total number of tokens in a batch.
-        max_length: int, sequences longer than this will be skipped. Defaults to
-            batch_size.
+        max_length: int, sequences longer than this will be skipped. Defaults
+            to batch_size.
         min_length_bucket: int
         length_bucket_step: float greater than 1.0
-        drop_long_sequences: bool, if True, then sequences longer than
-            `max_length` are dropped. This prevents generating batches with
-            more than the usual number of tokens, which can cause out-of-memory
-            errors.
-        shard_multiplier: an integer increasing the batch_size to suit splitting
-            across datashards.
+        shard_multiplier: an integer increasing the batch_size to suit
+            splitting across datashards.
         length_multiplier: an integer multiplier that is used to increase the
            batch sizes and sequence length tolerance.
         min_length: int, sequences shorter than this will be skipped.
-    Returns:
+    Return:
        A dictionary with parameters that can be passed to input_pipeline:
        * boundaries: list of bucket boundaries
        * batch_sizes: list of batch sizes for each length bucket
@@ -149,40 +146,33 @@ def get_batching_scheme(batch_size: int,
     max_length *= length_multiplier
 
     batch_sizes = [
-      max(1, batch_size // length) for length in boundaries + [max_length]
+        max(1, batch_size // length) for length in boundaries + [max_length]
     ]
     max_batch_size = max(batch_sizes)
     # Since the Datasets API only allows a single constant for window_size,
     # and it needs divide all bucket_batch_sizes, we pick a highly-composite
-    # window size and then round down all batch sizes to divisors of that window
-    # size, so that a window can always be divided evenly into batches.
-    # TODO(noam): remove this when Dataset API improves.
+    # window size and then round down all batch sizes to divisors of that
+    # window size, so that a window can always be divided evenly into batches.
     highly_composite_numbers = [
-      1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680,
-      2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360, 50400, 55440,
-      83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280,
-      720720, 1081080, 1441440, 2162160, 2882880, 3603600, 4324320, 6486480,
-      7207200, 8648640, 10810800, 14414400, 17297280, 21621600, 32432400,
-      36756720, 43243200, 61261200, 73513440, 110270160
+        1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260,
+        1680, 2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360,
+        50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960,
+        554400, 665280, 720720, 1081080, 1441440, 2162160, 2882880, 3603600,
+        4324320, 6486480, 7207200, 8648640, 10810800, 14414400, 17297280,
+        21621600, 32432400, 36756720, 43243200, 61261200, 73513440, 110270160
     ]
     window_size = max(
-      [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
+        [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
     divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
     batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
     window_size *= shard_multiplier
     batch_sizes = [bs * shard_multiplier for bs in batch_sizes]
-    # The Datasets API splits one window into multiple batches, which
-    # produces runs of many consecutive batches of the same size. This
-    # is bad for training. To solve this, we will shuffle the batches
-    # using a queue which must be several times as large as the maximum
-    # number of batches per window.
-    max_batches_per_window = window_size // min(batch_sizes)
-    shuffle_queue_size = max_batches_per_window * 3
 
     ret = BatchingScheme(bucket_boundaries=boundaries,
                          bucket_batch_sizes=batch_sizes)
     return ret
+
 
 # The protected functions below are designed to convert the ambiguous spec
 # structures to a normalized form.
diff --git a/neuralmonkey/readers/string_vector_reader.py b/neuralmonkey/readers/string_vector_reader.py
index d6545b2a3..439a23838 100644
--- a/neuralmonkey/readers/string_vector_reader.py
+++ b/neuralmonkey/readers/string_vector_reader.py
@@ -13,7 +13,7 @@ def process_line(line: str, lineno: int, path: str) -> np.ndarray:
 
         return np.array(numbers, dtype=dtype)
 
-    def reader(files: List[str])-> Iterable[List[np.ndarray]]:
+    def reader(files: List[str]) -> Iterable[List[np.ndarray]]:
         for path in files:
             current_line = 0
 
diff --git a/tests/hier-multiattention.ini b/tests/hier-multiattention.ini
index f4a4b5c68..f203ab665 100644
--- a/tests/hier-multiattention.ini
+++ b/tests/hier-multiattention.ini
@@ -4,6 +4,7 @@ tf_manager=
 output="tests/outputs/hier-multiattention"
 overwrite_output_dir=True
 epochs=1
+batch_size=1
 train_dataset=
 val_dataset=
 trainer=
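A worked example of the "highly composite window size" rounding kept in the hunk above (the numbers here are made up for illustration; only the arithmetic follows the patched code):

    # With a budget of 4096 tokens and a smallest length bucket of 8,
    # the largest per-bucket batch size is 4096 // 8 = 512.
    highly_composite_numbers = [1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180,
                                240, 360, 720, 840, 1260, 1680]
    max_batch_size = 4096 // 8                        # 512
    window_size = max(i for i in highly_composite_numbers
                      if i <= 3 * max_batch_size)     # 1260
    divisors = [i for i in range(1, window_size + 1) if window_size % i == 0]
    # 512 is rounded down to the largest divisor of 1260 that fits,
    # so a window of 1260 sequences always splits into whole batches:
    print(max(d for d in divisors if d <= max_batch_size))  # prints 420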