diff --git a/.coveragerc b/.coveragerc index 5f13f8d3fea..095caa82210 100644 --- a/.coveragerc +++ b/.coveragerc @@ -15,6 +15,7 @@ fail_under = 85 show_missing = True omit = keras/applications/* + keras/preprocessing/* keras/datasets/* keras/layers/cudnn_recurrent.py keras/legacy/* diff --git a/.github/ISSUE_TEMPLATE/a--tensorflow-backend-users.md b/.github/ISSUE_TEMPLATE/a--tensorflow-backend-users.md new file mode 100644 index 00000000000..a20c78f228d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/a--tensorflow-backend-users.md @@ -0,0 +1,17 @@ +--- +name: a) TensorFlow backend users +about: Select this if you're using Keras with the TensorFlow backend (default). + +--- + +Please make sure that the boxes below are checked before you submit your issue. +If your issue is an **implementation question**, please ask your question on [StackOverflow](http://stackoverflow.com/questions/tagged/keras) or [on the Keras Slack channel](https://keras-slack-autojoin.herokuapp.com/) instead of opening a GitHub issue. + +Thank you! + +- [ ] Check that you are up-to-date with the master branch of Keras. You can update with: +`pip install git+git://github.com/keras-team/keras.git --upgrade --no-deps` + +- [ ] Check that your version of TensorFlow is up-to-date. The installation instructions can be found [here](https://www.tensorflow.org/get_started/os_setup). + +- [ ] Provide a link to a GitHub Gist of a Python script that can reproduce your issue (or just copy the script here if it is short). diff --git a/.github/ISSUE_TEMPLATE/b--theano-backend-users.md b/.github/ISSUE_TEMPLATE/b--theano-backend-users.md new file mode 100644 index 00000000000..81fa2b43e7d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/b--theano-backend-users.md @@ -0,0 +1,18 @@ +--- +name: b) Theano backend users +about: Select this if you're using Keras with the Theano backend. + +--- + +Please make sure that the boxes below are checked before you submit your issue. +If your issue is an **implementation question**, please ask your question on [StackOverflow](http://stackoverflow.com/questions/tagged/keras) or [on the Keras Slack channel](https://keras-slack-autojoin.herokuapp.com/) instead of opening a GitHub issue. + +Thank you! + +- [ ] Check that you are up-to-date with the master branch of Keras. You can update with: +`pip install git+git://github.com/keras-team/keras.git --upgrade --no-deps` + +- [ ] Check that you are up-to-date with the master branch of Theano. You can update with: +`pip install git+git://github.com/Theano/Theano.git --upgrade --no-deps` + +- [ ] Provide a link to a GitHub Gist of a Python script that can reproduce your issue (or just copy the script here if it is short). diff --git a/.github/ISSUE_TEMPLATE/c--cntk-backend-users.md b/.github/ISSUE_TEMPLATE/c--cntk-backend-users.md new file mode 100644 index 00000000000..8d94610166c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/c--cntk-backend-users.md @@ -0,0 +1,17 @@ +--- +name: c) CNTK backend users +about: Select this if you're using Keras with the CNTK backend. + +--- + +Please make sure that the boxes below are checked before you submit your issue. +If your issue is an **implementation question**, please ask your question on [StackOverflow](http://stackoverflow.com/questions/tagged/keras) or [on the Keras Slack channel](https://keras-slack-autojoin.herokuapp.com/) instead of opening a GitHub issue. + +Thank you! + +- [ ] Check that you are up-to-date with the master branch of Keras.
You can update with: +`pip install git+git://github.com/keras-team/keras.git --upgrade --no-deps` + +- [ ] Check that your version of CNTK is up-to-date. + +- [ ] Provide a link to a GitHub Gist of a Python script that can reproduce your issue (or just copy the script here if it is short). diff --git a/.travis.yml b/.travis.yml index ac668eb31f1..e0b4766c590 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,15 @@ sudo: required dist: trusty language: python +cache: + directories: + - $HOME/.theano matrix: include: - - python: 2.7 - env: KERAS_BACKEND=tensorflow TEST_MODE=PEP8 - python: 2.7 env: KERAS_BACKEND=tensorflow TEST_MODE=INTEGRATION_TESTS - python: 3.6 - env: KERAS_BACKEND=tensorflow TEST_MODE=DOC + env: KERAS_BACKEND=tensorflow TEST_MODE=PEP8_DOC - python: 2.7 env: KERAS_BACKEND=tensorflow - python: 3.6 @@ -21,10 +22,6 @@ matrix: env: KERAS_BACKEND=cntk PYTHONWARNINGS=ignore - python: 3.6 env: KERAS_BACKEND=cntk PYTHONWARNINGS=ignore - - python: 2.7 - env: KERAS_BACKEND=mxnet PYTHONWARNINGS=ignore - - python: 3.6 - env: KERAS_BACKEND=mxnet PYTHONWARNINGS=ignore install: # code below is taken from http://conda.pydata.org/docs/travis.html # We do this conditionally because it saves us some downloading if the @@ -42,40 +39,33 @@ install: # Useful for debugging any issues with conda - conda info -a - - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION nose scipy matplotlib pandas pytest h5py + - travis_retry conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION - source activate test-environment - - pip install --only-binary=numpy,scipy numpy nose scipy matplotlib h5py theano + - travis_retry pip install --only-binary=numpy,scipy,pandas numpy nose scipy matplotlib h5py theano pytest pytest-pep8 pandas - pip install keras_applications keras_preprocessing - - conda install mkl mkl-service # set library path - export LD_LIBRARY_PATH=$HOME/miniconda/envs/test-environment/lib/:$LD_LIBRARY_PATH - # install PIL for preprocessing tests - - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - conda install pil; - else - conda install Pillow; + # install PIL for preprocessing tests (they are integration tests). + - if [[ "$TEST_MODE" == "INTEGRATION_TESTS" ]] || [[ "$TEST_MODE" == "PEP8_DOC" ]]; then + if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + export PIL=Pil; + else + export PIL=Pillow; + fi fi + # install pydot for visualization tests + - travis_retry conda install mkl mkl-service pydot graphviz $PIL + - pip install -e .[tests] # install TensorFlow (CPU version). - - pip install tensorflow==1.7 - - # install Apache MXNet (CPU version). 
- - pip install mxnet - - pip install --upgrade numpy - - # install cntk - - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - pip install https://cntk.ai/PythonWheel/CPU-Only/cntk-2.5.1-cp27-cp27mu-linux_x86_64.whl; - elif [[ "$TRAVIS_PYTHON_VERSION" == "3.6" ]]; then - pip install https://cntk.ai/PythonWheel/CPU-Only/cntk-2.5.1-cp36-cp36m-linux_x86_64.whl; - fi + - pip install tensorflow==1.9 - # install pydot for visualization tests - - conda install pydot graphviz + # install cntk + - pip install cntk # exclude different backends to measure a coverage for the designated backend only - if [[ "$KERAS_BACKEND" != "tensorflow" ]]; then @@ -87,9 +77,6 @@ install: - if [[ "$KERAS_BACKEND" != "cntk" ]]; then echo ' keras/backend/cntk_backend.py' >> .coveragerc; fi - - if [[ "$KERAS_BACKEND" != "mxnet" ]]; then - echo ' keras/backend/mxnet_backend.py' >> .coveragerc; - fi # detect whether core files are changed or not - export CORE_CHANGED=False; @@ -118,10 +105,8 @@ script: - echo -e "Running tests with the following config:\n$(cat ~/.keras/keras.json)" - if [[ "$TEST_MODE" == "INTEGRATION_TESTS" ]]; then PYTHONPATH=$PWD:$PYTHONPATH py.test tests/integration_tests; - elif [[ "$TEST_MODE" == "PEP8" ]]; then - PYTHONPATH=$PWD:$PYTHONPATH py.test --pep8 -m pep8 -n0; - elif [[ "$TEST_MODE" == "DOC" ]]; then - PYTHONPATH=$PWD:$PYTHONPATH py.test tests/test_documentation.py; + elif [[ "$TEST_MODE" == "PEP8_DOC" ]]; then + PYTHONPATH=$PWD:$PYTHONPATH py.test --pep8 -m pep8 -n0 && py.test tests/test_documentation.py; else PYTHONPATH=$PWD:$PYTHONPATH py.test tests/ --ignore=tests/integration_tests --ignore=tests/test_documentation.py --ignore=tests/keras/legacy/layers_test.py --cov-config .coveragerc --cov=keras tests/; fi \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 749f251af3b..b8985387bbf 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -41,8 +41,8 @@ RUN conda install -y python=${python_version} && \ pip install --upgrade pip && \ pip install \ sklearn_pandas \ - tensorflow-gpu && \ - pip install https://cntk.ai/PythonWheel/GPU/cntk-2.1-cp36-cp36m-linux_x86_64.whl && \ + tensorflow-gpu \ + cntk-gpu && \ conda install \ bcolz \ h5py \ @@ -52,6 +52,7 @@ RUN conda install -y python=${python_version} && \ notebook \ Pillow \ pandas \ + pydot \ pygpu \ pyyaml \ scikit-learn \ diff --git a/docs/autogen.py b/docs/autogen.py index b70f1d5fef9..f8fcb380f41 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -91,6 +91,7 @@ EXCLUDE = { 'Optimizer', + 'TFOptimizer', 'Wrapper', 'get_session', 'set_session', @@ -173,6 +174,7 @@ layers.SeparableConv2D, layers.Conv2DTranspose, layers.Conv3D, + layers.Conv3DTranspose, layers.Cropping1D, layers.Cropping2D, layers.Cropping3D, diff --git a/docs/templates/applications.md b/docs/templates/applications.md index 84fc176c0a9..6613afbf30f 100644 --- a/docs/templates/applications.md +++ b/docs/templates/applications.md @@ -172,17 +172,19 @@ model = InceptionV3(input_tensor=input_tensor, weights='imagenet', include_top=T | Model | Size | Top-1 Accuracy | Top-5 Accuracy | Parameters | Depth | | ----- | ----: | --------------: | --------------: | ----------: | -----: | -| [Xception](#xception) | 88 MB | 0.790 | 0.945| 22,910,480 | 126 | -| [VGG16](#vgg16) | 528 MB| 0.715 | 0.901 | 138,357,544 | 23 -| [VGG19](#vgg19) | 549 MB | 0.727 | 0.910 | 143,667,240 | 26 -| [ResNet50](#resnet50) | 99 MB | 0.759 | 0.929 | 25,636,712 | 168 -| [InceptionV3](#inceptionv3) | 92 MB | 0.788 | 0.944 | 23,851,784 | 159 | -| 
[InceptionResNetV2](#inceptionresnetv2) | 215 MB | 0.804 | 0.953 | 55,873,736 | 572 | -| [MobileNet](#mobilenet) | 17 MB | 0.665 | 0.871 | 4,253,864 | 88 -| [DenseNet121](#densenet) | 33 MB | 0.745 | 0.918 | 8,062,504 | 121 -| [DenseNet169](#densenet) | 57 MB | 0.759 | 0.928 | 14,307,880 | 169 -| [DenseNet201](#densenet) | 80 MB | 0.770 | 0.933 | 20,242,984 | 201 - +| [Xception](#xception) | 88 MB | 0.790 | 0.945 | 22,910,480 | 126 | +| [VGG16](#vgg16) | 528 MB | 0.713 | 0.901 | 138,357,544 | 23 | +| [VGG19](#vgg19) | 549 MB | 0.713 | 0.900 | 143,667,240 | 26 | +| [ResNet50](#resnet50) | 99 MB | 0.749 | 0.921 | 25,636,712 | 168 | +| [InceptionV3](#inceptionv3) | 92 MB | 0.779 | 0.937 | 23,851,784 | 159 | +| [InceptionResNetV2](#inceptionresnetv2) | 215 MB | 0.803 | 0.953 | 55,873,736 | 572 | +| [MobileNet](#mobilenet) | 16 MB | 0.704 | 0.895 | 4,253,864 | 88 | +| [MobileNetV2](#mobilenetv2) | 14 MB | 0.713 | 0.901 | 3,538,984 | 88 | +| [DenseNet121](#densenet) | 33 MB | 0.750 | 0.923 | 8,062,504 | 121 | +| [DenseNet169](#densenet) | 57 MB | 0.762 | 0.932 | 14,307,880 | 169 | +| [DenseNet201](#densenet) | 80 MB | 0.773 | 0.936 | 20,242,984 | 201 | +| [NASNetMobile](#nasnet) | 23 MB | 0.744 | 0.919 | 5,326,716 | - | +| [NASNetLarge](#nasnet) | 343 MB | 0.825 | 0.960 | 88,949,818 | - | The top-1 and top-5 accuracy refers to the model's performance on the ImageNet validation dataset. @@ -269,7 +271,7 @@ The default input size for this model is 224x224. has to be `(224, 224, 3)` (with `'channels_last'` data format) or `(3, 224, 224)` (with `'channels_first'` data format). It should have exactly 3 inputs channels, - and width and height should be no smaller than 48. + and width and height should be no smaller than 32. E.g. `(200, 200, 3)` would be one valid value. - pooling: Optional pooling mode for feature extraction when `include_top` is `False`. @@ -324,7 +326,7 @@ The default input size for this model is 224x224. has to be `(224, 224, 3)` (with `'channels_last'` data format) or `(3, 224, 224)` (with `'channels_first'` data format). It should have exactly 3 inputs channels, - and width and height should be no smaller than 48. + and width and height should be no smaller than 32. E.g. `(200, 200, 3)` would be one valid value. - pooling: Optional pooling mode for feature extraction when `include_top` is `False`. @@ -381,7 +383,7 @@ The default input size for this model is 224x224. has to be `(224, 224, 3)` (with `'channels_last'` data format) or `(3, 224, 224)` (with `'channels_first'` data format). It should have exactly 3 inputs channels, - and width and height should be no smaller than 197. + and width and height should be no smaller than 32. E.g. `(200, 200, 3)` would be one valid value. - pooling: Optional pooling mode for feature extraction when `include_top` is `False`. @@ -436,7 +438,7 @@ The default input size for this model is 299x299. has to be `(299, 299, 3)` (with `'channels_last'` data format) or `(3, 299, 299)` (with `'channels_first'` data format). It should have exactly 3 inputs channels, - and width and height should be no smaller than 139. + and width and height should be no smaller than 75. E.g. `(150, 150, 3)` would be one valid value. - pooling: Optional pooling mode for feature extraction when `include_top` is `False`. @@ -491,7 +493,7 @@ The default input size for this model is 299x299. has to be `(299, 299, 3)` (with `'channels_last'` data format) or `(3, 299, 299)` (with `'channels_first'` data format). 
It should have exactly 3 inputs channels, - and width and height should be no smaller than 139. + and width and height should be no smaller than 75. E.g. `(150, 150, 3)` would be one valid value. - pooling: Optional pooling mode for feature extraction when `include_top` is `False`. @@ -618,9 +620,11 @@ The default input size for this model is 224x224. to use as image input for the model. - input_shape: optional shape tuple, only to be specified if `include_top` is False (otherwise the input shape - has to be `(224, 224, 3)` (with `channels_last` data format) - or `(3, 224, 224)` (with `channels_first` data format). - It should have exactly 3 inputs channels. + has to be `(224, 224, 3)` (with `'channels_last'` data format) + or `(3, 224, 224)` (with `'channels_first'` data format). + It should have exactly 3 inputs channels, + and width and height should be no smaller than 32. + E.g. `(200, 200, 3)` would be one valid value. - pooling: optional pooling mode for feature extraction when `include_top` is `False`. - `None` means that the output of the model will be diff --git a/docs/templates/datasets.md b/docs/templates/datasets.md index a2f1d1a1eab..a3548394379 100644 --- a/docs/templates/datasets.md +++ b/docs/templates/datasets.md @@ -14,7 +14,7 @@ from keras.datasets import cifar10 - __Returns:__ - 2 tuples: - - __x_train, x_test__: uint8 array of RGB image data with shape (num_samples, 3, 32, 32). + - __x_train, x_test__: uint8 array of RGB image data with shape (num_samples, 3, 32, 32) or (num_samples, 32, 32, 3) based on the `image_data_format` backend setting of either `channels_first` or `channels_last` respectively. - __y_train, y_test__: uint8 array of category labels (integers in range 0-9) with shape (num_samples,). @@ -34,7 +34,7 @@ from keras.datasets import cifar100 - __Returns:__ - 2 tuples: - - __x_train, x_test__: uint8 array of RGB image data with shape (num_samples, 3, 32, 32). + - __x_train, x_test__: uint8 array of RGB image data with shape (num_samples, 3, 32, 32) or (num_samples, 32, 32, 3) based on the `image_data_format` backend setting of either `channels_first` or `channels_last` respectively. - __y_train, y_test__: uint8 array of category labels with shape (num_samples,). - __Arguments:__ @@ -206,4 +206,4 @@ from keras.datasets import boston_housing - __test_split__: fraction of the data to reserve as test set. - __Returns:__ - Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. \ No newline at end of file + Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. 
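The shape convention documented above follows the `image_data_format` setting in the Keras config. A minimal sketch, assuming only a standard Keras install, that loads CIFAR-10 and checks which axis carries the 3 colour channels:

```python
# Minimal sketch: the channel axis of the CIFAR-10 arrays follows the
# `image_data_format` backend setting ('channels_first' vs 'channels_last').
from keras import backend as K
from keras.datasets import cifar10

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

if K.image_data_format() == 'channels_first':
    assert x_train.shape[1:] == (3, 32, 32)
else:
    assert x_train.shape[1:] == (32, 32, 3)

print(x_train.shape, y_train.shape)  # e.g. (50000, 32, 32, 3) (50000, 1)
```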
diff --git a/docs/templates/getting-started/faq.md b/docs/templates/getting-started/faq.md index 87dabda1f70..d20a5fd1e5d 100644 --- a/docs/templates/getting-started/faq.md +++ b/docs/templates/getting-started/faq.md @@ -19,7 +19,7 @@ - [How can I use HDF5 inputs with Keras?](#how-can-i-use-hdf5-inputs-with-keras) - [Where is the Keras configuration file stored?](#where-is-the-keras-configuration-file-stored) - [How can I obtain reproducible results using Keras during development?](#how-can-i-obtain-reproducible-results-using-keras-during-development) -- [How can I install HDF5 or h5py to save my models in Keras?](#how-can-i-install-HDF5-or-h5py-to-save-my-models-in-Keras) +- [How can I install HDF5 or h5py to save my models in Keras?](#how-can-i-install-hdf5-or-h5py-to-save-my-models-in-keras) --- @@ -149,7 +149,7 @@ You can then use `keras.models.load_model(filepath)` to reinstantiate your model `load_model` will also take care of compiling the model using the saved training configuration (unless the model was never compiled in the first place). -Please also see [How can I install HDF5 or h5py to save my models in Keras?](#how-can-i-install-HDF5-or-h5py-to-save-my-models-in-Keras) for instructions on how to install `h5py`. +Please also see [How can I install HDF5 or h5py to save my models in Keras?](#how-can-i-install-hdf5-or-h5py-to-save-my-models-in-keras) for instructions on how to install `h5py`. Example: @@ -210,7 +210,7 @@ If you need to load weights into a *different* architecture (with some layers in model.load_weights('my_model_weights.h5', by_name=True) ``` -Please also see [How can I install HDF5 or h5py to save my models in Keras?](#how-can-i-install-HDF5-or-h5py-to-save-my-models-in-Keras) for instructions on how to install `h5py`. +Please also see [How can I install HDF5 or h5py to save my models in Keras?](#how-can-i-install-hdf5-or-h5py-to-save-my-models-in-keras) for instructions on how to install `h5py`. For example: @@ -517,7 +517,7 @@ with h5py.File('input/file.hdf5', 'r') as f: model.predict(x_data) ``` -Please also see [How can I install HDF5 or h5py to save my models in Keras?](#how-can-i-install-HDF5-or-h5py-to-save-my-models-in-Keras) for instructions on how to install `h5py`. +Please also see [How can I install HDF5 or h5py to save my models in Keras?](#how-can-i-install-hdf5-or-h5py-to-save-my-models-in-keras) for instructions on how to install `h5py`. --- diff --git a/docs/templates/layers/writing-your-own-keras-layers.md b/docs/templates/layers/writing-your-own-keras-layers.md index 3d608d0cb07..90108630b09 100644 --- a/docs/templates/layers/writing-your-own-keras-layers.md +++ b/docs/templates/layers/writing-your-own-keras-layers.md @@ -11,7 +11,6 @@ Here is the skeleton of a Keras layer, **as of Keras 2.0** (if you have an older ```python from keras import backend as K from keras.engine.topology import Layer -import numpy as np class MyLayer(Layer): @@ -34,4 +33,36 @@ class MyLayer(Layer): return (input_shape[0], self.output_dim) ``` +It is also possible to define Keras layers which have multiple input tensors and multiple output tensors. To do this, you should assume that the inputs and outputs of the methods `build(input_shape)`, `call(x)` and `compute_output_shape(input_shape)` are lists.
Here is an example, similar to the one above: + +```python +from keras import backend as K +from keras.engine.topology import Layer + +class MyLayer(Layer): + + def __init__(self, output_dim, **kwargs): + self.output_dim = output_dim + super(MyLayer, self).__init__(**kwargs) + + def build(self, input_shape): + assert isinstance(input_shape, list) + # Create a trainable weight variable for this layer. + self.kernel = self.add_weight(name='kernel', + shape=(input_shape[0][1], self.output_dim), + initializer='uniform', + trainable=True) + super(MyLayer, self).build(input_shape) # Be sure to call this at the end + + def call(self, x): + assert isinstance(x, list) + a, b = x + return [K.dot(a, self.kernel) + b, K.mean(b, axis=-1)] + + def compute_output_shape(self, input_shape): + assert isinstance(input_shape, list) + shape_a, shape_b = input_shape + return [(shape_a[0], self.output_dim), shape_b[:-1]] +``` + The existing Keras layers provide examples of how to implement almost anything. Never hesitate to read the source code! diff --git a/docs/templates/visualization.md b/docs/templates/visualization.md index 32f886d5143..118f939e500 100644 --- a/docs/templates/visualization.md +++ b/docs/templates/visualization.md @@ -23,3 +23,31 @@ from keras.utils.vis_utils import model_to_dot SVG(model_to_dot(model).create(prog='dot', format='svg')) ``` + +## Training history visualization + +The `fit()` method on a Keras `Model` returns a `History` object. The `History.history` attribute is a dictionary recording training loss values and metrics values at successive epochs, as well as validation loss values and validation metrics values (if applicable). Here is a simple example using `matplotlib` to generate loss & accuracy plots for training & validation: + +```python +import matplotlib.pyplot as plt + +history = model.fit(x, y, validation_split=0.25, epochs=50, batch_size=16, verbose=1) + +# Plot training & validation accuracy values +plt.plot(history.history['acc']) +plt.plot(history.history['val_acc']) +plt.title('Model accuracy') +plt.ylabel('Accuracy') +plt.xlabel('Epoch') +plt.legend(['Train', 'Test'], loc='upper left') +plt.show() + +# Plot training & validation loss values +plt.plot(history.history['loss']) +plt.plot(history.history['val_loss']) +plt.title('Model loss') +plt.ylabel('Loss') +plt.xlabel('Epoch') +plt.legend(['Train', 'Test'], loc='upper left') +plt.show() +``` diff --git a/docs/templates/why-use-keras.md b/docs/templates/why-use-keras.md index 8a5e46f5021..57b6502c204 100644 --- a/docs/templates/why-use-keras.md +++ b/docs/templates/why-use-keras.md @@ -14,15 +14,18 @@ There are countless deep learning frameworks available today. Why use Keras rath ## Keras has broad adoption in the industry and the research community -With over 200,000 individual users as of November 2017, Keras has stronger adoption in both the industry and the research community than any other deep learning framework except TensorFlow itself (and Keras is commonly used in conjunction with TensorFlow). + + + +

+ Deep learning frameworks ranking computed by Jeff Hale, based on 11 data sources across 7 categories + -You are already constantly interacting with features built with Keras -- it is in use at Netflix, Uber, Yelp, Instacart, Zocdoc, Square, and many others. It is especially popular among startups that place deep learning at the core of their products. - -Keras is also a favorite among deep learning researchers, coming in #2 in terms of mentions in scientific papers uploaded to the preprint server [arXiv.org](https://arxiv.org/archive/cs): +With over 250,000 individual users as of mid-2018, Keras has stronger adoption in both the industry and the research community than any other deep learning framework except TensorFlow itself (and the Keras API is the official frontend of TensorFlow, via the `tf.keras` module). - +You are already constantly interacting with features built with Keras -- it is in use at Netflix, Uber, Yelp, Instacart, Zocdoc, Square, and many others. It is especially popular among startups that place deep learning at the core of their products. -Keras has also been adopted by researchers at large scientific organizations, in particular CERN and NASA. +Keras is also a favorite among deep learning researchers, coming in #2 in terms of mentions in scientific papers uploaded to the preprint server [arXiv.org](https://arxiv.org/archive/cs). Keras has also been adopted by researchers at large scientific organizations, in particular CERN and NASA. --- diff --git a/examples/addition_rnn.py b/examples/addition_rnn.py index f0167d6e55f..bcc58d4e381 100644 --- a/examples/addition_rnn.py +++ b/examples/addition_rnn.py @@ -35,8 +35,8 @@ class CharacterTable(object): """Given a set of characters: - + Encode them to a one hot integer representation - + Decode the one hot integer representation to their character output + + Encode them to a one-hot integer representation + + Decode the one-hot or integer representation to their character output + Decode a vector of probabilities to their character output """ def __init__(self, chars): @@ -50,10 +50,11 @@ def __init__(self, chars): self.indices_char = dict((i, c) for i, c in enumerate(self.chars)) def encode(self, C, num_rows): - """One hot encode given string C. + """One-hot encode given string C. # Arguments - num_rows: Number of rows in the returned one hot encoding. This is + C: string, to be encoded. + num_rows: Number of rows in the returned one-hot encoding. This is used to keep the # of rows for each data the same. """ x = np.zeros((num_rows, len(self.chars))) @@ -62,6 +63,14 @@ def encode(self, C, num_rows): return x def decode(self, x, calc_argmax=True): + """Decode the given vector or 2D array to their character output. + + # Arguments + x: A vector or a 2D array of probabilities or one-hot representations; + or a vector of character indices (used with `calc_argmax=False`). + calc_argmax: Whether to find the character index with maximum + probability, defaults to `True`. + """ if calc_argmax: x = x.argmax(axis=-1) return ''.join(self.indices_char[x] for x in x) @@ -153,7 +162,7 @@ class colors: # Note: In a situation where your input sequences have a variable length, # use input_shape=(None, num_feature). model.add(RNN(HIDDEN_SIZE, input_shape=(MAXLEN, len(chars)))) -# As the decoder RNN's input, repeatedly provide with the last hidden state of +# As the decoder RNN's input, repeatedly provide with the last output of # RNN for each time step. 
Repeat 'DIGITS + 1' times as that's the maximum # length of output, e.g., when DIGITS=3, max output is 999+999=1998. model.add(layers.RepeatVector(DIGITS + 1)) @@ -167,8 +176,7 @@ class colors: # Apply a dense layer to the every temporal slice of an input. For each of step # of the output sequence, decide which character should be chosen. -model.add(layers.TimeDistributed(layers.Dense(len(chars)))) -model.add(layers.Activation('softmax')) +model.add(layers.TimeDistributed(layers.Dense(len(chars), activation='softmax'))) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) diff --git a/examples/cifar10_cnn_capsule.py b/examples/cifar10_cnn_capsule.py index 56b04d91272..15483caf767 100644 --- a/examples/cifar10_cnn_capsule.py +++ b/examples/cifar10_cnn_capsule.py @@ -6,10 +6,10 @@ With Data Augmentation: It gets to 75% validation accuracy in 10 epochs, -and 79% after 15 epochs, and 83% after 30 epcohs. -In my test, highest validation accuracy is 83.79% after 50 epcohs. +and 79% after 15 epochs, and 83% after 30 epochs. +In my test, highest validation accuracy is 83.79% after 50 epochs. -This is a fast Implement, just 20s/epcoh with a gtx 1070 gpu. +This is a fast implementation, just 20s/epoch with a GTX 1070 GPU. """ from __future__ import print_function diff --git a/examples/conv_filter_visualization.py b/examples/conv_filter_visualization.py index 6c54d264613..6c3a03521a4 100644 --- a/examples/conv_filter_visualization.py +++ b/examples/conv_filter_visualization.py @@ -133,8 +133,11 @@ def normalize(x): for i in range(n): for j in range(n): img, loss = kept_filters[i * n + j] - stitched_filters[(img_width + margin) * i: (img_width + margin) * i + img_width, - (img_height + margin) * j: (img_height + margin) * j + img_height, :] = img + width_margin = (img_width + margin) * i + height_margin = (img_height + margin) * j + stitched_filters[ + width_margin: width_margin + img_width, + height_margin: height_margin + img_height, :] = img # save the result to disk save_img('stitched_filters_%dx%d.png' % (n, n), stitched_filters) diff --git a/examples/deep_dream.py b/examples/deep_dream.py index d129dedb493..918837f3206 100644 --- a/examples/deep_dream.py +++ b/examples/deep_dream.py @@ -85,7 +85,8 @@ def deprocess_image(x): loss = K.variable(0.) for layer_name in settings['features']: # Add the L2 norm of the features of a layer to the loss. - assert layer_name in layer_dict.keys(), 'Layer ' + layer_name + ' not found in model.' + assert layer_name in layer_dict.keys(), ( + 'Layer ' + layer_name + ' not found in model.') coeff = settings['features'][layer_name] x = layer_dict[layer_name].output # We avoid border artifacts by only involving non-border pixels in the loss.
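In the long `assert` in `deep_dream.py` above, only the message is parenthesized for line length; wrapping the condition and the message together in one pair of parentheses would assert a two-element tuple, which is always truthy, so the check could never fire. A small sketch of the difference, using hypothetical names:

```python
# Sketch of the assert pitfall: a parenthesized (condition, message) pair is a
# non-empty tuple and therefore always truthy, so the first assert never fails.
layer_dict = {'mixed2': 'layer object'}  # hypothetical stand-in for {layer.name: layer}
layer_name = 'not_a_layer'

assert (layer_name in layer_dict.keys(),
        'Layer ' + layer_name + ' not found in model.')  # passes silently

# Keeping the condition outside the parentheses preserves the check:
try:
    assert layer_name in layer_dict.keys(), (
        'Layer ' + layer_name + ' not found in model.')
except AssertionError as e:
    print(e)  # Layer not_a_layer not found in model.
```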
diff --git a/examples/image_ocr.py b/examples/image_ocr.py index b119673824e..49bf290d1dd 100644 --- a/examples/image_ocr.py +++ b/examples/image_ocr.py @@ -88,16 +88,23 @@ def paint_text(text, w, h, rotate=False, ud=False, multi_fonts=False): context.paint() # this font list works in CentOS 7 if multi_fonts: - fonts = ['Century Schoolbook', 'Courier', 'STIX', 'URW Chancery L', 'FreeMono'] - context.select_font_face(np.random.choice(fonts), cairo.FONT_SLANT_NORMAL, - np.random.choice([cairo.FONT_WEIGHT_BOLD, cairo.FONT_WEIGHT_NORMAL])) + fonts = [ + 'Century Schoolbook', 'Courier', 'STIX', + 'URW Chancery L', 'FreeMono'] + context.select_font_face( + np.random.choice(fonts), + cairo.FONT_SLANT_NORMAL, + np.random.choice([cairo.FONT_WEIGHT_BOLD, cairo.FONT_WEIGHT_NORMAL])) else: - context.select_font_face('Courier', cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_BOLD) + context.select_font_face('Courier', + cairo.FONT_SLANT_NORMAL, + cairo.FONT_WEIGHT_BOLD) context.set_font_size(25) box = context.text_extents(text) border_w_h = (4, 4) if box[2] > (w - 2 * border_w_h[1]) or box[3] > (h - 2 * border_w_h[0]): - raise IOError('Could not fit string into image. Max char count is too large for given image width.') + raise IOError(('Could not fit string into image.' + 'Max char count is too large for given image width.')) # teach the RNN translational invariance by # fitting text box randomly on canvas, with some room to rotate @@ -211,13 +218,18 @@ def build_word_list(self, num_words, max_string_len=None, mono_fraction=0.5): self.X_text = [] self.Y_len = [0] * self.num_words + def _is_length_of_word_valid(word): + return (max_string_len == -1 or + max_string_len is None or + len(word) <= max_string_len) + # monogram file is sorted by frequency in english speech with codecs.open(self.monogram_file, mode='r', encoding='utf-8') as f: for line in f: if len(tmp_string_list) == int(self.num_words * mono_fraction): break word = line.rstrip() - if max_string_len == -1 or max_string_len is None or len(word) <= max_string_len: + if _is_length_of_word_valid(word): tmp_string_list.append(word) # bigram file contains common word pairings in english speech @@ -228,11 +240,11 @@ def build_word_list(self, num_words, max_string_len=None, mono_fraction=0.5): break columns = line.lower().split() word = columns[0] + ' ' + columns[1] - if is_valid_str(word) and \ - (max_string_len == -1 or max_string_len is None or len(word) <= max_string_len): + if is_valid_str(word) and _is_length_of_word_valid(word): tmp_string_list.append(word) if len(tmp_string_list) != self.num_words: - raise IOError('Could not pull enough words from supplied monogram and bigram files. 
') + raise IOError('Could not pull enough words' + 'from supplied monogram and bigram files.') # interlace to mix up the easy and hard words self.string_list[::2] = tmp_string_list[:self.num_words // 2] self.string_list[1::2] = tmp_string_list[self.num_words // 2:] @@ -274,9 +286,11 @@ def get_batch(self, index, size, train): source_str.append('') else: if K.image_data_format() == 'channels_first': - X_data[i, 0, 0:self.img_w, :] = self.paint_func(self.X_text[index + i])[0, :, :].T + X_data[i, 0, 0:self.img_w, :] = ( + self.paint_func(self.X_text[index + i])[0, :, :].T) else: - X_data[i, 0:self.img_w, :, 0] = self.paint_func(self.X_text[index + i])[0, :, :].T + X_data[i, 0:self.img_w, :, 0] = ( + self.paint_func(self.X_text[index + i])[0, :, :].T) labels[i, :] = self.Y_data[index + i] input_length[i] = self.img_w // self.downsample_factor - 2 label_length[i] = self.Y_len[index + i] @@ -292,7 +306,8 @@ def get_batch(self, index, size, train): def next_train(self): while 1: - ret = self.get_batch(self.cur_train_index, self.minibatch_size, train=True) + ret = self.get_batch(self.cur_train_index, + self.minibatch_size, train=True) self.cur_train_index += self.minibatch_size if self.cur_train_index >= self.val_split: self.cur_train_index = self.cur_train_index % 32 @@ -302,7 +317,8 @@ def next_train(self): def next_val(self): while 1: - ret = self.get_batch(self.cur_val_index, self.minibatch_size, train=False) + ret = self.get_batch(self.cur_val_index, + self.minibatch_size, train=False) self.cur_val_index += self.minibatch_size if self.cur_val_index >= self.num_words: self.cur_val_index = self.val_split + self.cur_val_index % 32 @@ -310,20 +326,24 @@ def next_val(self): def on_train_begin(self, logs={}): self.build_word_list(16000, 4, 1) - self.paint_func = lambda text: paint_text(text, self.img_w, self.img_h, - rotate=False, ud=False, multi_fonts=False) + self.paint_func = lambda text: paint_text( + text, self.img_w, self.img_h, + rotate=False, ud=False, multi_fonts=False) def on_epoch_begin(self, epoch, logs={}): # rebind the paint function to implement curriculum learning if 3 <= epoch < 6: - self.paint_func = lambda text: paint_text(text, self.img_w, self.img_h, - rotate=False, ud=True, multi_fonts=False) + self.paint_func = lambda text: paint_text( + text, self.img_w, self.img_h, + rotate=False, ud=True, multi_fonts=False) elif 6 <= epoch < 9: - self.paint_func = lambda text: paint_text(text, self.img_w, self.img_h, - rotate=False, ud=True, multi_fonts=True) + self.paint_func = lambda text: paint_text( + text, self.img_w, self.img_h, + rotate=False, ud=True, multi_fonts=True) elif epoch >= 9: - self.paint_func = lambda text: paint_text(text, self.img_w, self.img_h, - rotate=True, ud=True, multi_fonts=True) + self.paint_func = lambda text: paint_text( + text, self.img_w, self.img_h, + rotate=True, ud=True, multi_fonts=True) if epoch >= 21 and self.max_string_len < 12: self.build_word_list(32000, 12, 0.5) @@ -371,22 +391,27 @@ def show_edit_distance(self, num): while num_left > 0: word_batch = next(self.text_img_gen)[0] num_proc = min(word_batch['the_input'].shape[0], num_left) - decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc]) + decoded_res = decode_batch(self.test_func, + word_batch['the_input'][0:num_proc]) for j in range(num_proc): - edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j]) + edit_dist = editdistance.eval(decoded_res[j], + word_batch['source_str'][j]) mean_ed += float(edit_dist) mean_norm_ed += float(edit_dist) / 
len(word_batch['source_str'][j]) num_left -= num_proc mean_norm_ed = mean_norm_ed / num mean_ed = mean_ed / num - print('\nOut of %d samples: Mean edit distance: %.3f Mean normalized edit distance: %0.3f' + print('\nOut of %d samples: Mean edit distance:' + '%.3f Mean normalized edit distance: %0.3f' % (num, mean_ed, mean_norm_ed)) def on_epoch_end(self, epoch, logs={}): - self.model.save_weights(os.path.join(self.output_dir, 'weights%02d.h5' % (epoch))) + self.model.save_weights( + os.path.join(self.output_dir, 'weights%02d.h5' % (epoch))) self.show_edit_distance(256) word_batch = next(self.text_img_gen)[0] - res = decode_batch(self.test_func, word_batch['the_input'][0:self.num_display_words]) + res = decode_batch(self.test_func, + word_batch['the_input'][0:self.num_display_words]) if word_batch['the_input'][0].shape[0] < 256: cols = 2 else: @@ -398,7 +423,9 @@ def on_epoch_end(self, epoch, logs={}): else: the_input = word_batch['the_input'][i, :, :, 0] pylab.imshow(the_input.T, cmap='Greys_r') - pylab.xlabel('Truth = \'%s\'\nDecoded = \'%s\'' % (word_batch['source_str'][i], res[i])) + pylab.xlabel( + 'Truth = \'%s\'\nDecoded = \'%s\'' % + (word_batch['source_str'][i], res[i])) fig = pylab.gcf() fig.set_size_inches(10, 13) pylab.savefig(os.path.join(self.output_dir, 'e%02d.png' % (epoch))) @@ -425,17 +452,19 @@ def train(run_name, start_epoch, stop_epoch, img_w): else: input_shape = (img_w, img_h, 1) - fdir = os.path.dirname(get_file('wordlists.tgz', - origin='http://www.mythic-ai.com/datasets/wordlists.tgz', untar=True)) - - img_gen = TextImageGenerator(monogram_file=os.path.join(fdir, 'wordlist_mono_clean.txt'), - bigram_file=os.path.join(fdir, 'wordlist_bi_clean.txt'), - minibatch_size=minibatch_size, - img_w=img_w, - img_h=img_h, - downsample_factor=(pool_size ** 2), - val_split=words_per_epoch - val_words - ) + fdir = os.path.dirname( + get_file('wordlists.tgz', + origin='http://www.mythic-ai.com/datasets/wordlists.tgz', + untar=True)) + + img_gen = TextImageGenerator( + monogram_file=os.path.join(fdir, 'wordlist_mono_clean.txt'), + bigram_file=os.path.join(fdir, 'wordlist_bi_clean.txt'), + minibatch_size=minibatch_size, + img_w=img_w, + img_h=img_h, + downsample_factor=(pool_size ** 2), + val_split=words_per_epoch - val_words) act = 'relu' input_data = Input(name='the_input', shape=input_shape, dtype='float32') inner = Conv2D(conv_filters, kernel_size, padding='same', @@ -447,7 +476,8 @@ def train(run_name, start_epoch, stop_epoch, img_w): name='conv2')(inner) inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(inner) - conv_to_rnn_dims = (img_w // (pool_size ** 2), (img_h // (pool_size ** 2)) * conv_filters) + conv_to_rnn_dims = (img_w // (pool_size ** 2), + (img_h // (pool_size ** 2)) * conv_filters) inner = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner) # cuts down input size going into RNN: @@ -455,11 +485,16 @@ def train(run_name, start_epoch, stop_epoch, img_w): # Two layers of bidirectional GRUs # GRU seems to work as well, if not better than LSTM: - gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(inner) - gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(inner) + gru_1 = GRU(rnn_size, return_sequences=True, + kernel_initializer='he_normal', name='gru1')(inner) + gru_1b = GRU(rnn_size, return_sequences=True, + go_backwards=True, kernel_initializer='he_normal', + name='gru1_b')(inner) gru1_merged = add([gru_1, gru_1b]) - gru_2 = GRU(rnn_size, 
return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged) - gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged) + gru_2 = GRU(rnn_size, return_sequences=True, + kernel_initializer='he_normal', name='gru2')(gru1_merged) + gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, + kernel_initializer='he_normal', name='gru2_b')(gru1_merged) # transforms RNN output to character activations: inner = Dense(img_gen.get_output_size(), kernel_initializer='he_normal', @@ -467,39 +502,47 @@ def train(run_name, start_epoch, stop_epoch, img_w): y_pred = Activation('softmax', name='softmax')(inner) Model(inputs=input_data, outputs=y_pred).summary() - labels = Input(name='the_labels', shape=[img_gen.absolute_max_string_len], dtype='float32') + labels = Input(name='the_labels', + shape=[img_gen.absolute_max_string_len], dtype='float32') input_length = Input(name='input_length', shape=[1], dtype='int64') label_length = Input(name='label_length', shape=[1], dtype='int64') # Keras doesn't currently support loss funcs with extra parameters # so CTC loss is implemented in a lambda layer - loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length]) + loss_out = Lambda( + ctc_lambda_func, output_shape=(1,), + name='ctc')([y_pred, labels, input_length, label_length]) # clipnorm seems to speeds up convergence sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5) - model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out) + model = Model(inputs=[input_data, labels, input_length, label_length], + outputs=loss_out) # the loss calc occurs elsewhere, so use a dummy lambda func for the loss model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd) if start_epoch > 0: - weight_file = os.path.join(OUTPUT_DIR, os.path.join(run_name, 'weights%02d.h5' % (start_epoch - 1))) + weight_file = os.path.join( + OUTPUT_DIR, + os.path.join(run_name, 'weights%02d.h5' % (start_epoch - 1))) model.load_weights(weight_file) # captures output of softmax so we can decode the output during visualization test_func = K.function([input_data], [y_pred]) viz_cb = VizCallback(run_name, test_func, img_gen.next_val()) - model.fit_generator(generator=img_gen.next_train(), - steps_per_epoch=(words_per_epoch - val_words) // minibatch_size, - epochs=stop_epoch, - validation_data=img_gen.next_val(), - validation_steps=val_words // minibatch_size, - callbacks=[viz_cb, img_gen], - initial_epoch=start_epoch) + model.fit_generator( + generator=img_gen.next_train(), + steps_per_epoch=(words_per_epoch - val_words) // minibatch_size, + epochs=stop_epoch, + validation_data=img_gen.next_val(), + validation_steps=val_words // minibatch_size, + callbacks=[viz_cb, img_gen], + initial_epoch=start_epoch) if __name__ == '__main__': run_name = datetime.datetime.now().strftime('%Y:%m:%d:%H:%M:%S') train(run_name, 0, 20, 128) - # increase to wider images and start at epoch 20. The learned weights are reloaded + # increase to wider images and start at epoch 20. 
+ # The learned weights are reloaded train(run_name, 20, 25, 512) diff --git a/examples/imdb_fasttext.py b/examples/imdb_fasttext.py index 91653177e6d..d01f6fff190 100644 --- a/examples/imdb_fasttext.py +++ b/examples/imdb_fasttext.py @@ -75,8 +75,10 @@ def add_ngram(sequences, token_indice, ngram_range=2): (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) print(len(x_train), 'train sequences') print(len(x_test), 'test sequences') -print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) -print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) +print('Average train sequence length: {}'.format( + np.mean(list(map(len, x_train)), dtype=int))) +print('Average test sequence length: {}'.format( + np.mean(list(map(len, x_test)), dtype=int))) if ngram_range > 1: print('Adding {}-gram features'.format(ngram_range)) @@ -100,8 +102,10 @@ def add_ngram(sequences, token_indice, ngram_range=2): # Augmenting x_train and x_test with n-grams features x_train = add_ngram(x_train, token_indice, ngram_range) x_test = add_ngram(x_test, token_indice, ngram_range) - print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int))) - print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int))) + print('Average train sequence length: {}'.format( + np.mean(list(map(len, x_train)), dtype=int))) + print('Average test sequence length: {}'.format( + np.mean(list(map(len, x_test)), dtype=int))) print('Pad sequences (samples x time)') x_train = sequence.pad_sequences(x_train, maxlen=maxlen) diff --git a/examples/imdb_lstm.py b/examples/imdb_lstm.py index 7de39087019..9d75cdc3e92 100644 --- a/examples/imdb_lstm.py +++ b/examples/imdb_lstm.py @@ -22,7 +22,8 @@ from keras import backend as K max_features = 20000 -maxlen = 80 # cut texts after this number of words (among top max_features most common words) +# cut texts after this number of words (among top max_features most common words) +maxlen = 80 batch_size = 32 print('Loading data...') diff --git a/examples/lstm_text_generation.py b/examples/lstm_text_generation.py index 961850a8422..fa47d4fbfe5 100644 --- a/examples/lstm_text_generation.py +++ b/examples/lstm_text_generation.py @@ -13,7 +13,7 @@ from __future__ import print_function from keras.callbacks import LambdaCallback from keras.models import Sequential -from keras.layers import Dense, Activation +from keras.layers import Dense from keras.layers import LSTM from keras.optimizers import RMSprop from keras.utils.data_utils import get_file @@ -22,7 +22,9 @@ import sys import io -path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt') +path = get_file( + 'nietzsche.txt', + origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt') with io.open(path, encoding='utf-8') as f: text = f.read().lower() print('corpus length:', len(text)) @@ -55,8 +57,7 @@ print('Build model...') model = Sequential() model.add(LSTM(128, input_shape=(maxlen, len(chars)))) -model.add(Dense(len(chars))) -model.add(Activation('softmax')) +model.add(Dense(len(chars), activation='softmax')) optimizer = RMSprop(lr=0.01) model.compile(loss='categorical_crossentropy', optimizer=optimizer) @@ -72,7 +73,7 @@ def sample(preds, temperature=1.0): return np.argmax(probas) -def on_epoch_end(epoch, logs): +def on_epoch_end(epoch, _): # Function invoked at end of each epoch. Prints generated text. 
print() print('----- Generating text after Epoch: %d' % epoch) diff --git a/examples/mnist_acgan.py b/examples/mnist_acgan.py index 93fd09597cb..8f7ebddc502 100644 --- a/examples/mnist_acgan.py +++ b/examples/mnist_acgan.py @@ -182,36 +182,26 @@ def build_discriminator(): for epoch in range(1, epochs + 1): print('Epoch {}/{}'.format(epoch, epochs)) - num_batches = int(x_train.shape[0] / batch_size) + num_batches = int(np.ceil(x_train.shape[0] / float(batch_size))) progress_bar = Progbar(target=num_batches) - # we don't want the discriminator to also maximize the classification - # accuracy of the auxiliary classifier on generated images, so we - # don't train discriminator to produce class labels for generated - # images (see https://openreview.net/forum?id=rJXTf9Bxg). - # To preserve sum of sample weights for the auxiliary classifier, - # we assign sample weight of 2 to the real images. - disc_sample_weight = [np.ones(2 * batch_size), - np.concatenate((np.ones(batch_size) * 2, - np.zeros(batch_size)))] - epoch_gen_loss = [] epoch_disc_loss = [] for index in range(num_batches): - # generate a new batch of noise - noise = np.random.uniform(-1, 1, (batch_size, latent_size)) - # get a batch of real images image_batch = x_train[index * batch_size:(index + 1) * batch_size] label_batch = y_train[index * batch_size:(index + 1) * batch_size] + # generate a new batch of noise + noise = np.random.uniform(-1, 1, (len(image_batch), latent_size)) + # sample some labels from p_c - sampled_labels = np.random.randint(0, num_classes, batch_size) + sampled_labels = np.random.randint(0, num_classes, len(image_batch)) # generate a batch of fake images, using the generated labels as a # conditioner. We reshape the sampled labels to be - # (batch_size, 1) so that we can feed them into the embedding + # (len(image_batch), 1) so that we can feed them into the embedding # layer as a length one sequence generated_images = generator.predict( [noise, sampled_labels.reshape((-1, 1))], verbose=0) @@ -222,9 +212,20 @@ def build_discriminator(): # Salimans et al., 2016 # https://arxiv.org/pdf/1606.03498.pdf (Section 3.4) soft_zero, soft_one = 0, 0.95 - y = np.array([soft_one] * batch_size + [soft_zero] * batch_size) + y = np.array( + [soft_one] * len(image_batch) + [soft_zero] * len(image_batch)) aux_y = np.concatenate((label_batch, sampled_labels), axis=0) + # we don't want the discriminator to also maximize the classification + # accuracy of the auxilary classifier on generated images, so we + # don't train discriminator to produce class labels for generated + # images (see https://openreview.net/forum?id=rJXTf9Bxg). + # To preserve sum of sample weights for the auxilary classifier, + # we assign sample weight of 2 to the real images. + disc_sample_weight = [np.ones(2 * len(image_batch)), + np.concatenate((np.ones(len(image_batch)) * 2, + np.zeros(len(image_batch))))] + # see if the discriminator can figure itself out... epoch_disc_loss.append(discriminator.train_on_batch( x, [y, aux_y], sample_weight=disc_sample_weight)) @@ -232,13 +233,13 @@ def build_discriminator(): # make new noise. 
we generate 2 * batch size here such that we have # the generator optimize over an identical number of images as the # discriminator - noise = np.random.uniform(-1, 1, (2 * batch_size, latent_size)) - sampled_labels = np.random.randint(0, num_classes, 2 * batch_size) + noise = np.random.uniform(-1, 1, (2 * len(image_batch), latent_size)) + sampled_labels = np.random.randint(0, num_classes, 2 * len(image_batch)) # we want to train the generator to trick the discriminator # For the generator, we want all the {fake, not-fake} labels to say # not-fake - trick = np.ones(2 * batch_size) * soft_one + trick = np.ones(2 * len(image_batch)) * soft_one epoch_gen_loss.append(combined.train_on_batch( [noise, sampled_labels.reshape((-1, 1))], diff --git a/examples/mnist_hierarchical_rnn.py b/examples/mnist_hierarchical_rnn.py index 8fe4b4d7a8e..84cae8204dd 100644 --- a/examples/mnist_hierarchical_rnn.py +++ b/examples/mnist_hierarchical_rnn.py @@ -12,12 +12,14 @@ # References -- [A Hierarchical Neural Autoencoder for Paragraphs and Documents](https://arxiv.org/abs/1506.01057) +- [A Hierarchical Neural Autoencoder for Paragraphs and Documents] + (https://arxiv.org/abs/1506.01057) Encodes paragraphs and documents with HRNN. Results have shown that HRNN outperforms standard RNNs and may play some role in more sophisticated generation tasks like summarization or question answering. -- [Hierarchical recurrent neural network for skeleton based action recognition](http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7298714) +- [Hierarchical recurrent neural network for skeleton based action recognition] + (http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7298714) Achieved state-of-the-art results on skeleton based action recognition with 3 levels of bidirectional HRNN combined with fully connected layers. diff --git a/examples/mnist_net2net.py b/examples/mnist_net2net.py index 77a16b6a0a8..bf34fd04fee 100644 --- a/examples/mnist_net2net.py +++ b/examples/mnist_net2net.py @@ -123,8 +123,9 @@ def wider2net_conv2d(teacher_w1, teacher_b1, teacher_w2, new_width, init): if init == 'random-pad': new_w1 = np.random.normal(0, 0.1, size=teacher_w1.shape[:3] + (n,)) new_b1 = np.ones(n) * 0.1 - new_w2 = np.random.normal(0, 0.1, - size=teacher_w2.shape[:2] + (n, teacher_w2.shape[3])) + new_w2 = np.random.normal( + 0, 0.1, + size=teacher_w2.shape[:2] + (n, teacher_w2.shape[3])) elif init == 'net2wider': index = np.random.randint(teacher_w1.shape[3], size=n) factors = np.bincount(index)[index] + 1. 
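The `mnist_acgan.py` changes above size each per-batch array with `len(image_batch)` rather than `batch_size`, because the ceil-based batch count makes the final slice potentially shorter than a full batch. A toy sketch of that pattern, assuming nothing beyond NumPy:

```python
# Sketch: with a ceil-based batch count the last slice can be shorter than
# batch_size, so per-batch buffers are sized from the actual slice length.
import numpy as np

num_samples, batch_size, latent_size = 10, 3, 4
x_train = np.arange(num_samples)

num_batches = int(np.ceil(num_samples / float(batch_size)))  # -> 4
for index in range(num_batches):
    image_batch = x_train[index * batch_size:(index + 1) * batch_size]
    noise = np.random.uniform(-1, 1, (len(image_batch), latent_size))
    print(index, len(image_batch), noise.shape)
# The last batch has length 1, so noise is shaped (1, 4) to match it.
```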
diff --git a/examples/mnist_siamese.py b/examples/mnist_siamese.py index 10d32f9bc62..3c5f2e30cd2 100644 --- a/examples/mnist_siamese.py +++ b/examples/mnist_siamese.py @@ -29,7 +29,8 @@ def euclidean_distance(vects): x, y = vects - return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) + sum_square = K.sum(K.square(x - y), axis=1, keepdims=True) + return K.sqrt(K.maximum(sum_square, K.epsilon())) def eucl_dist_output_shape(shapes): @@ -42,8 +43,9 @@ def contrastive_loss(y_true, y_pred): http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf ''' margin = 1 - return K.mean(y_true * K.square(y_pred) + - (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))) + sqaure_pred = K.square(y_pred) + margin_square = K.square(K.maximum(margin - y_pred, 0)) + return K.mean(y_true * sqaure_pred + (1 - y_true) * margin_square) def create_pairs(x, digit_indices): diff --git a/examples/mnist_sklearn_wrapper.py b/examples/mnist_sklearn_wrapper.py index 51001065e6e..75bef0ec409 100644 --- a/examples/mnist_sklearn_wrapper.py +++ b/examples/mnist_sklearn_wrapper.py @@ -12,7 +12,7 @@ from keras.layers import Conv2D, MaxPooling2D from keras.wrappers.scikit_learn import KerasClassifier from keras import backend as K -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV num_classes = 10 diff --git a/examples/mnist_tfrecord.py b/examples/mnist_tfrecord.py index a8cc93ea194..d2b89e16674 100644 --- a/examples/mnist_tfrecord.py +++ b/examples/mnist_tfrecord.py @@ -206,9 +206,10 @@ def cnn_layers(x_train_input): coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess, coord) -train_model.fit(epochs=epochs, - steps_per_epoch=int(np.ceil(data.train.num_examples / float(batch_size))), - callbacks=[EvaluateInputTensor(test_model, steps=100)]) +train_model.fit( + epochs=epochs, + steps_per_epoch=int(np.ceil(data.train.num_examples / float(batch_size))), + callbacks=[EvaluateInputTensor(test_model, steps=100)]) # Save the model weights. train_model.save_weights('saved_wt.h5') diff --git a/examples/neural_doodle.py b/examples/neural_doodle.py index db893f5bfa5..6e8fa3b544f 100644 --- a/examples/neural_doodle.py +++ b/examples/neural_doodle.py @@ -31,11 +31,16 @@ # References -- [Dmitry Ulyanov's blog on fast-neural-doodle](http://dmitryulyanov.github.io/feed-forward-neural-doodle/) -- [Torch code for fast-neural-doodle](https://github.com/DmitryUlyanov/fast-neural-doodle) -- [Torch code for online-neural-doodle](https://github.com/DmitryUlyanov/online-neural-doodle) -- [Paper Texture Networks: Feed-forward Synthesis of Textures and Stylized Images](http://arxiv.org/abs/1603.03417) -- [Discussion on parameter tuning](https://github.com/keras-team/keras/issues/3705) +- [Dmitry Ulyanov's blog on fast-neural-doodle] + (http://dmitryulyanov.github.io/feed-forward-neural-doodle/) +- [Torch code for fast-neural-doodle] + (https://github.com/DmitryUlyanov/fast-neural-doodle) +- [Torch code for online-neural-doodle] + (https://github.com/DmitryUlyanov/online-neural-doodle) +- [Paper Texture Networks: Feed-forward Synthesis of Textures and Stylized Images] + (http://arxiv.org/abs/1603.03417) +- [Discussion on parameter tuning] + (https://github.com/keras-team/keras/issues/3705) # Resources @@ -137,7 +142,8 @@ def kmeans(xs, k): def load_mask_labels(): '''Load both target and style masks. 
A mask image (nr x nc) with m labels/colors will be loaded - as a 4D boolean tensor: (1, m, nr, nc) for 'channels_first' or (1, nr, nc, m) for 'channels_last' + as a 4D boolean tensor: + (1, m, nr, nc) for 'channels_first' or (1, nr, nc, m) for 'channels_last' ''' target_mask_img = load_img(target_mask_path, target_size=(img_nrows, img_ncols)) diff --git a/examples/neural_style_transfer.py b/examples/neural_style_transfer.py index c16c873c3c6..e2d66e055f4 100644 --- a/examples/neural_style_transfer.py +++ b/examples/neural_style_transfer.py @@ -2,15 +2,18 @@ Run the script with: ``` -python neural_style_transfer.py path_to_your_base_image.jpg path_to_your_reference.jpg prefix_for_results +python neural_style_transfer.py path_to_your_base_image.jpg \ + path_to_your_reference.jpg prefix_for_results ``` e.g.: ``` -python neural_style_transfer.py img/tuebingen.jpg img/starry_night.jpg results/my_result +python neural_style_transfer.py img/tuebingen.jpg \ + img/starry_night.jpg results/my_result ``` Optional parameters: ``` ---iter, To specify the number of iterations the style transfer takes place (Default is 10) +--iter, To specify the number of iterations \ + the style transfer takes place (Default is 10) --content_weight, The weight given to the content loss (Default is 0.025) --style_weight, The weight given to the style loss (Default is 1.0) --tv_weight, The weight given to the total variation loss (Default is 1.0) @@ -192,11 +195,15 @@ def content_loss(base, combination): def total_variation_loss(x): assert K.ndim(x) == 4 if K.image_data_format() == 'channels_first': - a = K.square(x[:, :, :img_nrows - 1, :img_ncols - 1] - x[:, :, 1:, :img_ncols - 1]) - b = K.square(x[:, :, :img_nrows - 1, :img_ncols - 1] - x[:, :, :img_nrows - 1, 1:]) + a = K.square( + x[:, :, :img_nrows - 1, :img_ncols - 1] - x[:, :, 1:, :img_ncols - 1]) + b = K.square( + x[:, :, :img_nrows - 1, :img_ncols - 1] - x[:, :, :img_nrows - 1, 1:]) else: - a = K.square(x[:, :img_nrows - 1, :img_ncols - 1, :] - x[:, 1:, :img_ncols - 1, :]) - b = K.square(x[:, :img_nrows - 1, :img_ncols - 1, :] - x[:, :img_nrows - 1, 1:, :]) + a = K.square( + x[:, :img_nrows - 1, :img_ncols - 1, :] - x[:, 1:, :img_ncols - 1, :]) + b = K.square( + x[:, :img_nrows - 1, :img_ncols - 1, :] - x[:, :img_nrows - 1, 1:, :]) return K.sum(K.pow(a + b, 1.25)) # combine these loss functions into a single scalar diff --git a/examples/pretrained_word_embeddings.py b/examples/pretrained_word_embeddings.py index c3ac0a5e661..f5a004a2287 100644 --- a/examples/pretrained_word_embeddings.py +++ b/examples/pretrained_word_embeddings.py @@ -103,10 +103,10 @@ print('Preparing embedding matrix.') # prepare embedding matrix -num_words = min(MAX_NUM_WORDS, len(word_index) + 1) +num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 embedding_matrix = np.zeros((num_words, EMBEDDING_DIM)) for word, i in word_index.items(): - if i >= MAX_NUM_WORDS: + if i > MAX_NUM_WORDS: continue embedding_vector = embeddings_index.get(word) if embedding_vector is not None: diff --git a/examples/variational_autoencoder.py b/examples/variational_autoencoder.py index 5d771e79235..db3869c91b3 100644 --- a/examples/variational_autoencoder.py +++ b/examples/variational_autoencoder.py @@ -196,7 +196,7 @@ def plot_results(models, show_shapes=True) if args.weights: - vae = vae.load_weights(args.weights) + vae.load_weights(args.weights) else: # train the autoencoder vae.fit(x_train, diff --git a/examples/variational_autoencoder_deconv.py b/examples/variational_autoencoder_deconv.py index 
acb0d3d6337..1e540d219a5 100644 --- a/examples/variational_autoencoder_deconv.py +++ b/examples/variational_autoencoder_deconv.py @@ -222,7 +222,7 @@ def plot_results(models, plot_model(vae, to_file='vae_cnn.png', show_shapes=True) if args.weights: - vae = vae.load_weights(args.weights) + vae.load_weights(args.weights) else: # train the autoencoder vae.fit(x_train, diff --git a/keras/__init__.py b/keras/__init__.py index 856e0757000..de888fbafcd 100644 --- a/keras/__init__.py +++ b/keras/__init__.py @@ -23,4 +23,4 @@ from .models import Model from .models import Sequential -__version__ = '2.2.2' +__version__ = '2.2.4' diff --git a/keras/activations.py b/keras/activations.py index 2cc67f4be83..54ad2831cd0 100644 --- a/keras/activations.py +++ b/keras/activations.py @@ -106,20 +106,26 @@ def softsign(x): return K.softsign(x) -def relu(x, alpha=0., max_value=None): +def relu(x, alpha=0., max_value=None, threshold=0.): """Rectified Linear Unit. + With default values, it returns element-wise `max(x, 0)`. + + Otherwise, it follows: + `f(x) = max_value` for `x >= max_value`, + `f(x) = x` for `threshold <= x < max_value`, + `f(x) = alpha * (x - threshold)` otherwise. + # Arguments x: Input tensor. - alpha: Slope of the negative part. Defaults to zero. - max_value: Maximum value for the output. + alpha: float. Slope of the negative part. Defaults to zero. + max_value: float. Saturation threshold. + threshold: float. Threshold value for thresholded activation. # Returns - The (leaky) rectified linear unit activation: `x` if `x > 0`, - `alpha * x` if `x < 0`. If `max_value` is defined, the result - is truncated to this value. + A tensor. """ - return K.relu(x, alpha=alpha, max_value=max_value) + return K.relu(x, alpha=alpha, max_value=max_value, threshold=threshold) def tanh(x): @@ -152,6 +158,12 @@ def hard_sigmoid(x): return K.hard_sigmoid(x) +def exponential(x): + """Exponential (base e) activation function. + """ + return K.exp(x) + + def linear(x): """Linear (i.e. identity) activation function. """ diff --git a/keras/applications/__init__.py b/keras/applications/__init__.py index a2c2840bde6..a250e4684cb 100644 --- a/keras/applications/__init__.py +++ b/keras/applications/__init__.py @@ -3,18 +3,32 @@ from __future__ import print_function from .. import backend -from .. import engine from .. import layers from .. import models from .. 
import utils import keras_applications -keras_applications.set_keras_submodules( - backend=backend, - layers=layers, - models=models, - utils=utils) +if not hasattr(keras_applications, 'get_submodules_from_kwargs'): + keras_applications.set_keras_submodules( + backend=backend, + layers=layers, + models=models, + utils=utils) + + +def keras_modules_injection(base_fun): + + def wrapper(*args, **kwargs): + if hasattr(keras_applications, 'get_submodules_from_kwargs'): + kwargs['backend'] = backend + kwargs['layers'] = layers + kwargs['models'] = models + kwargs['utils'] = utils + return base_fun(*args, **kwargs) + + return wrapper + from .vgg16 import VGG16 from .vgg19 import VGG19 @@ -23,6 +37,6 @@ from .inception_resnet_v2 import InceptionResNetV2 from .xception import Xception from .mobilenet import MobileNet -from .mobilenetv2 import MobileNetV2 +from .mobilenet_v2 import MobileNetV2 from .densenet import DenseNet121, DenseNet169, DenseNet201 from .nasnet import NASNetMobile, NASNetLarge diff --git a/keras/applications/densenet.py b/keras/applications/densenet.py index 8cc8a7149a9..fb3082200f3 100644 --- a/keras/applications/densenet.py +++ b/keras/applications/densenet.py @@ -3,9 +3,29 @@ from __future__ import print_function from keras_applications import densenet +from . import keras_modules_injection -DenseNet121 = densenet.DenseNet121 -DenseNet169 = densenet.DenseNet169 -DenseNet201 = densenet.DenseNet201 -decode_predictions = densenet.decode_predictions -preprocess_input = densenet.preprocess_input + +@keras_modules_injection +def DenseNet121(*args, **kwargs): + return densenet.DenseNet121(*args, **kwargs) + + +@keras_modules_injection +def DenseNet169(*args, **kwargs): + return densenet.DenseNet169(*args, **kwargs) + + +@keras_modules_injection +def DenseNet201(*args, **kwargs): + return densenet.DenseNet201(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return densenet.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return densenet.preprocess_input(*args, **kwargs) diff --git a/keras/applications/imagenet_utils.py b/keras/applications/imagenet_utils.py index d7594c348ad..41542f1c2af 100644 --- a/keras/applications/imagenet_utils.py +++ b/keras/applications/imagenet_utils.py @@ -5,6 +5,15 @@ from __future__ import print_function from keras_applications import imagenet_utils +from . import keras_modules_injection -preprocess_input = imagenet_utils.preprocess_input -decode_predictions = imagenet_utils.decode_predictions + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return imagenet_utils.decode_predictions( + *args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return imagenet_utils.preprocess_input(*args, **kwargs) diff --git a/keras/applications/inception_resnet_v2.py b/keras/applications/inception_resnet_v2.py index 7dd58cc505d..199c17e45ba 100644 --- a/keras/applications/inception_resnet_v2.py +++ b/keras/applications/inception_resnet_v2.py @@ -3,7 +3,19 @@ from __future__ import print_function from keras_applications import inception_resnet_v2 +from . 
import keras_modules_injection -InceptionResNetV2 = inception_resnet_v2.InceptionResNetV2 -decode_predictions = inception_resnet_v2.decode_predictions -preprocess_input = inception_resnet_v2.preprocess_input + +@keras_modules_injection +def InceptionResNetV2(*args, **kwargs): + return inception_resnet_v2.InceptionResNetV2(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return inception_resnet_v2.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return inception_resnet_v2.preprocess_input(*args, **kwargs) diff --git a/keras/applications/inception_v3.py b/keras/applications/inception_v3.py index f64893e53e5..9dc1dc8c558 100644 --- a/keras/applications/inception_v3.py +++ b/keras/applications/inception_v3.py @@ -3,7 +3,19 @@ from __future__ import print_function from keras_applications import inception_v3 +from . import keras_modules_injection -InceptionV3 = inception_v3.InceptionV3 -decode_predictions = inception_v3.decode_predictions -preprocess_input = inception_v3.preprocess_input + +@keras_modules_injection +def InceptionV3(*args, **kwargs): + return inception_v3.InceptionV3(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return inception_v3.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return inception_v3.preprocess_input(*args, **kwargs) diff --git a/keras/applications/mobilenet.py b/keras/applications/mobilenet.py index b4199dc198b..cdf1ff0b0b1 100644 --- a/keras/applications/mobilenet.py +++ b/keras/applications/mobilenet.py @@ -3,7 +3,19 @@ from __future__ import print_function from keras_applications import mobilenet +from . import keras_modules_injection -MobileNet = mobilenet.MobileNet -decode_predictions = mobilenet.decode_predictions -preprocess_input = mobilenet.preprocess_input + +@keras_modules_injection +def MobileNet(*args, **kwargs): + return mobilenet.MobileNet(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return mobilenet.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return mobilenet.preprocess_input(*args, **kwargs) diff --git a/keras/applications/mobilenet_v2.py b/keras/applications/mobilenet_v2.py new file mode 100644 index 00000000000..c6879de71c7 --- /dev/null +++ b/keras/applications/mobilenet_v2.py @@ -0,0 +1,21 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from keras_applications import mobilenet_v2 +from . 
import keras_modules_injection + + +@keras_modules_injection +def MobileNetV2(*args, **kwargs): + return mobilenet_v2.MobileNetV2(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return mobilenet_v2.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return mobilenet_v2.preprocess_input(*args, **kwargs) diff --git a/keras/applications/mobilenetv2.py b/keras/applications/mobilenetv2.py index fd06d5fa172..e46fee8cf04 100644 --- a/keras/applications/mobilenetv2.py +++ b/keras/applications/mobilenetv2.py @@ -1,9 +1,2 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from keras_applications import mobilenet_v2 - -MobileNetV2 = mobilenet_v2.MobileNetV2 -decode_predictions = mobilenet_v2.decode_predictions -preprocess_input = mobilenet_v2.preprocess_input +# Only for backwards compatibility. +from .mobilenet_v2 import * diff --git a/keras/applications/nasnet.py b/keras/applications/nasnet.py index c9c487cd39b..3910cf27e04 100644 --- a/keras/applications/nasnet.py +++ b/keras/applications/nasnet.py @@ -3,8 +3,24 @@ from __future__ import print_function from keras_applications import nasnet +from . import keras_modules_injection -NASNetMobile = nasnet.NASNetMobile -NASNetLarge = nasnet.NASNetLarge -decode_predictions = nasnet.decode_predictions -preprocess_input = nasnet.preprocess_input + +@keras_modules_injection +def NASNetMobile(*args, **kwargs): + return nasnet.NASNetMobile(*args, **kwargs) + + +@keras_modules_injection +def NASNetLarge(*args, **kwargs): + return nasnet.NASNetLarge(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return nasnet.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return nasnet.preprocess_input(*args, **kwargs) diff --git a/keras/applications/resnet50.py b/keras/applications/resnet50.py index 12d885e061b..815050f20c4 100644 --- a/keras/applications/resnet50.py +++ b/keras/applications/resnet50.py @@ -3,7 +3,19 @@ from __future__ import print_function from keras_applications import resnet50 +from . import keras_modules_injection -ResNet50 = resnet50.ResNet50 -decode_predictions = resnet50.decode_predictions -preprocess_input = resnet50.preprocess_input + +@keras_modules_injection +def ResNet50(*args, **kwargs): + return resnet50.ResNet50(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return resnet50.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return resnet50.preprocess_input(*args, **kwargs) diff --git a/keras/applications/vgg16.py b/keras/applications/vgg16.py index 829f350ea65..68a7d8f768b 100644 --- a/keras/applications/vgg16.py +++ b/keras/applications/vgg16.py @@ -3,7 +3,19 @@ from __future__ import print_function from keras_applications import vgg16 +from . 
import keras_modules_injection -VGG16 = vgg16.VGG16 -decode_predictions = vgg16.decode_predictions -preprocess_input = vgg16.preprocess_input + +@keras_modules_injection +def VGG16(*args, **kwargs): + return vgg16.VGG16(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return vgg16.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return vgg16.preprocess_input(*args, **kwargs) diff --git a/keras/applications/vgg19.py b/keras/applications/vgg19.py index 0d2d1abf060..4d6f83e1d11 100644 --- a/keras/applications/vgg19.py +++ b/keras/applications/vgg19.py @@ -3,7 +3,19 @@ from __future__ import print_function from keras_applications import vgg19 +from . import keras_modules_injection -VGG19 = vgg19.VGG19 -decode_predictions = vgg19.decode_predictions -preprocess_input = vgg19.preprocess_input + +@keras_modules_injection +def VGG19(*args, **kwargs): + return vgg19.VGG19(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return vgg19.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return vgg19.preprocess_input(*args, **kwargs) diff --git a/keras/applications/xception.py b/keras/applications/xception.py index 06f1535314c..0f645280c94 100644 --- a/keras/applications/xception.py +++ b/keras/applications/xception.py @@ -3,7 +3,19 @@ from __future__ import print_function from keras_applications import xception +from . import keras_modules_injection -Xception = xception.Xception -decode_predictions = xception.decode_predictions -preprocess_input = xception.preprocess_input + +@keras_modules_injection +def Xception(*args, **kwargs): + return xception.Xception(*args, **kwargs) + + +@keras_modules_injection +def decode_predictions(*args, **kwargs): + return xception.decode_predictions(*args, **kwargs) + + +@keras_modules_injection +def preprocess_input(*args, **kwargs): + return xception.preprocess_input(*args, **kwargs) diff --git a/keras/backend/cntk_backend.py b/keras/backend/cntk_backend.py index 2eed3b6040e..5c6228c11fb 100644 --- a/keras/backend/cntk_backend.py +++ b/keras/backend/cntk_backend.py @@ -1156,17 +1156,20 @@ def permute_dimensions(x, pattern): return C.transpose(x, axis) -def resize_images(x, height_factor, width_factor, data_format): - if data_format == 'channels_first': - output = repeat_elements(x, height_factor, axis=2) - output = repeat_elements(output, width_factor, axis=3) - return output - elif data_format == 'channels_last': - output = repeat_elements(x, height_factor, axis=1) - output = repeat_elements(output, width_factor, axis=2) - return output +def resize_images(x, height_factor, width_factor, data_format, interpolation='nearest'): + if interpolation == 'nearest': + if data_format == 'channels_first': + output = repeat_elements(x, height_factor, axis=2) + output = repeat_elements(output, width_factor, axis=3) + return output + elif data_format == 'channels_last': + output = repeat_elements(x, height_factor, axis=1) + output = repeat_elements(output, width_factor, axis=2) + return output + else: + raise ValueError('CNTK Backend: Invalid data_format: %s' % data_format) else: - raise ValueError('CNTK Backend: Invalid data_format:', data_format) + raise NotImplementedError('CNTK only supports `nearest` interpolation.') def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): @@ -1181,7 +1184,7 @@ def resize_volumes(x, depth_factor, height_factor, width_factor, 
data_format): output = repeat_elements(output, width_factor, axis=3) return output else: - raise ValueError('CNTK Backend: Invalid data_format:', data_format) + raise ValueError('CNTK Backend: Invalid data_format: %s' % data_format) def repeat_elements(x, rep, axis): @@ -1436,7 +1439,7 @@ def _recurrence(x, states, m): for o, p in zip(new_states, place_holders): n_s.append(o.replace_placeholders({p: o.output})) if len(n_s) > 0: - new_output = n_s[0] + new_output = n_s[-1] return new_output, n_s final_output, final_states = _recurrence(rnn_inputs, states, mask) @@ -1491,17 +1494,27 @@ def conv1d(x, kernel, strides=1, padding='valid', if data_format == 'channels_last': x = C.swapaxes(x, 0, 1) - kernel = C.swapaxes(kernel, 0, 2) + + # As of Keras 2.0.0, all kernels are normalized + # on the format `(steps, input_depth, depth)`, + # independently of `data_format`. + # CNTK expects `(depth, input_depth, steps)`. + kernel = C.swapaxes(kernel, 0, 2) padding = _preprocess_border_mode(padding) - strides = [strides] + + if dev.type() == 0 and dilation_rate != 1: + raise ValueError('Dilated convolution on CPU is not supported by CNTK backend. ' + 'Please set `dilation_rate` to 1. You passed: %s' % (dilation_rate,)) + + dilation_rate = (1, dilation_rate) + x = C.convolution( kernel, x, - strides=tuple(strides), - auto_padding=[ - False, - padding]) + strides=strides, + auto_padding=[False, padding], + dilation=dilation_rate) if data_format == 'channels_last': x = C.swapaxes(x, 0, 1) @@ -1515,27 +1528,20 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', x = _preprocess_conv2d_input(x, data_format) kernel = _preprocess_conv2d_kernel(kernel, data_format) padding = _preprocess_border_mode(padding) - if dilation_rate == (1, 1): - strides = (1,) + strides - x = C.convolution( - kernel, - x, - strides, - auto_padding=[ - False, - padding, - padding]) - else: - assert dilation_rate[0] == dilation_rate[1] - assert strides == (1, 1), 'Invalid strides for dilated convolution' - x = C.convolution( - kernel, - x, - strides=dilation_rate[0], - auto_padding=[ - False, - padding, - padding]) + + if dev.type() == 0 and dilation_rate != (1, 1): + raise ValueError('Dilated convolution on CPU is not supported by CNTK backend. ' + 'Please set `dilation_rate` to (1, 1). ' + 'You passed: %s' % (dilation_rate,)) + + dilation_rate = (1,) + dilation_rate + + x = C.convolution(kernel, + x, + strides, + auto_padding=[False, padding, padding], + dilation=dilation_rate) + return _postprocess_conv2d_output(x, data_format) @@ -1659,17 +1665,21 @@ def conv3d(x, kernel, strides=(1, 1, 1), padding='valid', x = _preprocess_conv3d_input(x, data_format) kernel = _preprocess_conv3d_kernel(kernel, data_format) padding = _preprocess_border_mode(padding) - strides = strides + (strides[0],) + + if dev.type() == 0 and dilation_rate != (1, 1, 1): + raise ValueError('Dilated convolution on CPU is not supported by CNTK backend. ' + 'Please set `dilation_rate` to (1, 1, 1). 
' + 'You passed: %s' % (dilation_rate,)) + + dilation_rate = (1,) + dilation_rate x = C.convolution( kernel, x, strides, - auto_padding=[ - False, - padding, - padding, - padding]) + auto_padding=[False, padding, padding, padding], + dilation=dilation_rate) + return _postprocess_conv3d_output(x, data_format) @@ -1757,14 +1767,25 @@ def pool3d(x, pool_size, strides=(1, 1, 1), padding='valid', return _postprocess_conv3d_output(x, data_format) -def relu(x, alpha=0., max_value=None): +def relu(x, alpha=0., max_value=None, threshold=0.): + if alpha != 0.: - negative_part = C.relu(-x) - x = C.relu(x) + if threshold != 0.: + negative_part = C.relu(-x + threshold) + else: + negative_part = C.relu(-x) + + if threshold != 0.: + x = x * C.greater(x, threshold) + else: + x = C.relu(x) + if max_value is not None: x = C.clip(x, 0.0, max_value) + if alpha != 0.: x -= alpha * negative_part + return x @@ -2187,7 +2208,7 @@ def in_top_k(predictions, targets, k): def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), - padding='valid', data_format=None): + padding='valid', data_format=None, dilation_rate=(1, 1)): data_format = normalize_data_format(data_format) x = _preprocess_conv2d_input(x, data_format) @@ -2201,6 +2222,8 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), output_shape = transpose_shape(output_shape, 'channels_first', spatial_axes=(0, 1)) + dilation_rate = (1,) + dilation_rate + x = C.convolution_transpose( kernel, x, @@ -2209,7 +2232,8 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), False, padding, padding], - output_shape=output_shape) + output_shape=output_shape, + dilation=dilation_rate) return _postprocess_conv2d_output(x, data_format) diff --git a/keras/backend/mxnet_backend.py b/keras/backend/mxnet_backend.py index 232d20c0ce2..111460fec9f 100644 --- a/keras/backend/mxnet_backend.py +++ b/keras/backend/mxnet_backend.py @@ -2079,7 +2079,7 @@ def permute_dimensions(x, pattern): @keras_mxnet_symbol -def resize_images(x, height_factor, width_factor, data_format): +def resize_images(x, height_factor, width_factor, data_format, interpolation='nearest'): """Resizes the images contained in a 4D tensor. # Arguments @@ -2087,6 +2087,7 @@ def resize_images(x, height_factor, width_factor, data_format): height_factor: Positive integer. width_factor: Positive integer. data_format: string, `"channels_last"` or `"channels_first"`. + interpolation: A string, one of `nearest` or `bilinear`. # Returns A tensor. 
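The `interpolation` argument threaded through the backend `resize_images` implementations above can be exercised directly from user code. A minimal sketch, assuming a toy `channels_last` input and an upsampling factor of 2 (the shapes and values below are illustrative assumptions, not part of the patch):

```python
import numpy as np
from keras import backend as K

# Assumed toy input: one 8x8 RGB image in channels_last layout.
x = K.variable(np.random.random((1, 8, 8, 3)))

# Previous behaviour, still the default: nearest-neighbour upsampling.
y_nearest = K.resize_images(x, 2, 2, data_format='channels_last')

# New in this change: bilinear interpolation on backends that support it
# (the Theano path only accepts a (2, 2) factor, and CNTK raises
# NotImplementedError for anything other than 'nearest').
y_bilinear = K.resize_images(x, 2, 2, data_format='channels_last',
                             interpolation='bilinear')
```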
@@ -2464,13 +2465,12 @@ def get_mxnet_module_arg_params(x): """ # retrieve from bind values first, which is up to date with # arg_params in mxnet module + ret = eval(x) if isinstance(x, KerasSymbol): if x.tensor is not None: if x.name in x.get_bind_values() and _MODEL is not None: _MODEL._sync_weights() ret = x.get_bind_values()[x.name].asnumpy() - else: - ret = eval(x) return ret @@ -2771,9 +2771,9 @@ def rnn(step_function, inputs, initial_states, if mx.__version__ < '1.3.1': raise NotImplementedError('unroll=False in RNN only works with MXNet 1.3.1 or newer, ' 'please upgrade to latest master using: pip install --ugprade mxnet --pre') - # defining step functions for each RNN cells, implementation taken from call functions # from each RNN cell class in keras.layers.recurrent + def _simple_rnn_cell_step(data, states): # Refer to SimpleRNNCell's call function in keras.layers.recurrent inputs = data[0] @@ -3033,7 +3033,8 @@ def _gru_cell_step(data, states): # Reverse the input sequence if go_backwards: inputs = reverse(inputs, 0) - mask = reverse(mask, 0) + if mask is not None: + mask = reverse(mask, 0) # Transpose to time-major, i.e. # from (batch, time, ...) to (time, batch, ...) @@ -3218,7 +3219,7 @@ def in_test_phase(x, alt, training=None): # NN OPERATIONS @keras_mxnet_symbol -def relu(x, alpha=0., max_value=None): +def relu(x, alpha=0., max_value=None, threshold=0.): """Rectified linear unit. With default values, it returns element-wise `max(x, 0)`. @@ -3231,10 +3232,32 @@ def relu(x, alpha=0., max_value=None): # Returns A tensor. """ - ret = mx.sym.LeakyReLU(data=x.symbol, act_type='leaky', slope=alpha) - if max_value and max_value > 0: - ret = mx.sym.minimum(ret, max_value) - return KerasSymbol(ret) + if alpha != 0.: + if max_value is None and threshold == 0.: + return KerasSymbol(mx.sym.LeakyReLU(data=x.symbol, act_type='leaky', slope=alpha)) + + if threshold != 0.: + data = -x + threshold + negative_part = mx.sym.LeakyReLU(data=data.symbol, act_type='leaky', slope=0.) + else: + data = -x + negative_part = mx.sym.LeakyReLU(data=data.symbol, act_type='leaky', slope=0.) + + clip_max = max_value is not None + + if threshold != 0: + # computes x for x > threshold else 0 + x = x * cast(greater(x, threshold), floatx()) + else: + x = KerasSymbol(mx.sym.LeakyReLU(data=x.symbol, act_type='leaky', slope=alpha)) + + if clip_max: + x = KerasSymbol(mx.sym.clip(x.symbol, 0, max_value)) + + if alpha != 0: + negative_part = alpha * negative_part + x = x - KerasSymbol(negative_part) + return x @keras_mxnet_symbol @@ -3252,7 +3275,7 @@ def elu(x, alpha=1.): @keras_mxnet_symbol -def softmax(x): +def softmax(x, axis=-1): """Softmax of a tensor. # Arguments @@ -3261,7 +3284,7 @@ def softmax(x): # Returns A tensor. """ - return KerasSymbol(mx.sym.softmax(data=x.symbol)) + return KerasSymbol(mx.sym.softmax(data=x.symbol, axis=axis)) @keras_mxnet_symbol @@ -3671,7 +3694,7 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), - padding='valid', data_format=None): + padding='valid', data_format=None, dilation_rate=(1, 1)): """2D deconvolution (i.e. transposed convolution). 
# Arguments @@ -3712,7 +3735,8 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), if padding not in {'same', 'valid'}: raise ValueError('MXNet Backend: `padding` should be either `same` or `valid`.') - return _convnd_transpose(x, kernel, output_shape, name='conv2d_transpose', strides=strides, data_format=data_format) + return _convnd_transpose(x, kernel, output_shape, name='conv2d_transpose', + strides=strides, data_format=data_format, dilation_rate=dilation_rate) def separable_conv1d(x, depthwise_kernel, pointwise_kernel, strides=1, @@ -5028,7 +5052,7 @@ def _convnd(x, kernel, strides, filter_dilation, name=None, padding_mode='valid' @keras_mxnet_symbol -def _convnd_transpose(x, kernel, output_shape, strides, data_format, name=None): +def _convnd_transpose(x, kernel, output_shape, strides, data_format, name=None, dilation_rate=(1, 1)): # Handle Data Format x = _preprocess_convnd_input(x, data_format) kernel = _preprocess_convnd_kernel(kernel, data_format) @@ -5049,7 +5073,7 @@ def _convnd_transpose(x, kernel, output_shape, strides, data_format, name=None): deconv = mx.sym.Deconvolution(data=x.symbol, name=_prepare_name(name, "convnd_transpose"), kernel=layout_kernel, stride=strides, num_filter=nb_filter, weight=kernel.symbol, - no_bias=True, target_shape=output_shape) + no_bias=True, target_shape=output_shape, dilate=dilation_rate) # Handle original Data Format result = _postprocess_convnd_output(KerasSymbol(deconv), data_format) @@ -5179,6 +5203,16 @@ def get_mxnet_model_info(model): return data_names, data_shapes +def get_num_gpus(): + try: + gpus = mx.test_utils.list_gpus() + except CalledProcessError: + gpus = [] + if gpus and len(gpus) > 0: + return len(gpus) + return 0 + + def get_model(): """Prepares Model class that can be used for training a Keras model with MXNet backend. Inherits and extends keras.engine.Model class. @@ -5603,6 +5637,7 @@ def __init__(self, layers=None, *args, **kwargs): self.name = kwargs['name'] engine.Model.__init__(self, *args, **kwargs) + self._build_input_shape = None # Add to the model any layers passed to the constructor. if layers: @@ -5632,8 +5667,10 @@ def _get_lr(self, _): def get_config(self): config = {} - if hasattr(self, 'clip_gradient'): - config['clipnorm'] = self.clip_gradient + if hasattr(self, 'clipnorm'): + config['clipnorm'] = self.clipnorm + if hasattr(self, 'clipvalue'): + config['clipvalue'] = self.clipvalue return config class SGD(MXOptimizer, mx.optimizer.SGD): diff --git a/keras/backend/tensorflow_backend.py b/keras/backend/tensorflow_backend.py index 21d15963a36..2d23706948d 100644 --- a/keras/backend/tensorflow_backend.py +++ b/keras/backend/tensorflow_backend.py @@ -15,6 +15,7 @@ from collections import defaultdict import numpy as np +from distutils.version import StrictVersion import os from .common import floatx @@ -1235,7 +1236,9 @@ def max(x, axis=None, keepdims=False): # Arguments x: A tensor or variable. - axis: An integer, the axis to find maximum values. + axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to find maximum values. If `None` (default), finds the + maximum over all dimensions. keepdims: A boolean, whether to keep the dimensions or not. If `keepdims` is `False`, the rank of the tensor is reduced by 1. If `keepdims` is `True`, @@ -1252,7 +1255,9 @@ def min(x, axis=None, keepdims=False): # Arguments x: A tensor or variable. - axis: An integer, the axis to find minimum values. + axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to find minimum values. 
If `None` (default), finds the + minimum over all dimensions. keepdims: A boolean, whether to keep the dimensions or not. If `keepdims` is `False`, the rank of the tensor is reduced by 1. If `keepdims` is `True`, @@ -1269,7 +1274,9 @@ def sum(x, axis=None, keepdims=False): # Arguments x: A tensor or variable. - axis: An integer, the axis to sum over. + axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to sum over. If `None` (default), sums over all + dimensions. keepdims: A boolean, whether to keep the dimensions or not. If `keepdims` is `False`, the rank of the tensor is reduced by 1. If `keepdims` is `True`, @@ -1286,7 +1293,9 @@ def prod(x, axis=None, keepdims=False): # Arguments x: A tensor or variable. - axis: An integer, the axis to compute the product. + axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to compute the product. If `None` (default), computes + the product over all dimensions. keepdims: A boolean, whether to keep the dimensions or not. If `keepdims` is `False`, the rank of the tensor is reduced by 1. If `keepdims` is `True`, @@ -1329,7 +1338,9 @@ def var(x, axis=None, keepdims=False): # Arguments x: A tensor or variable. - axis: An integer, the axis to compute the variance. + axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to compute the variance. If `None` (default), computes + the variance over all dimensions. keepdims: A boolean, whether to keep the dimensions or not. If `keepdims` is `False`, the rank of the tensor is reduced by 1. If `keepdims` is `True`, @@ -1352,7 +1363,9 @@ def std(x, axis=None, keepdims=False): # Arguments x: A tensor or variable. - axis: An integer, the axis to compute the standard deviation. + axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to compute the standard deviation. If `None` (default), + computes the standard deviation over all dimensions. keepdims: A boolean, whether to keep the dimensions or not. If `keepdims` is `False`, the rank of the tensor is reduced by 1. If `keepdims` is `True`, @@ -1369,7 +1382,9 @@ def mean(x, axis=None, keepdims=False): # Arguments x: A tensor or variable. - axis: A list of integer. Axes to compute the mean. + axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to compute the mean. If `None` (default), computes + the mean over all dimensions. keepdims: A boolean, whether to keep the dimensions or not. If `keepdims` is `False`, the rank of the tensor is reduced by 1 for each entry in `axis`. If `keepdims` is `True`, @@ -1388,7 +1403,9 @@ def any(x, axis=None, keepdims=False): # Arguments x: Tensor or variable. - axis: axis along which to perform the reduction. + axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to compute the logical or. If `None` (default), computes + the logical or over all dimensions. keepdims: whether the drop or broadcast the reduction axes. # Returns @@ -1403,7 +1420,9 @@ def all(x, axis=None, keepdims=False): # Arguments x: Tensor or variable. - axis: axis along which to perform the reduction. + axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to compute the logical and. If `None` (default), computes + the logical and over all dimensions. keepdims: whether the drop or broadcast the reduction axes. # Returns @@ -1511,7 +1530,9 @@ def logsumexp(x, axis=None, keepdims=False): # Arguments x: A tensor or variable. - axis: An integer, the axis to reduce over. 
+ axis: An integer or list of integers in [-rank(x), rank(x)), + the axes to compute the logsumexp. If `None` (default), computes + the logsumexp over all dimensions. keepdims: A boolean, whether to keep the dimensions or not. If `keepdims` is `False`, the rank of the tensor is reduced by 1. If `keepdims` is `True`, the reduced dimension is @@ -1884,17 +1905,17 @@ def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3): # The mean / var / beta / gamma may be processed by broadcast # so it may have extra axes with 1, it is not needed and should be removed if ndim(mean) > 1: - mean = tf.squeeze(mean) + mean = tf.reshape(mean, [-1]) if ndim(var) > 1: - var = tf.squeeze(var) + var = tf.reshape(var, [-1]) if beta is None: beta = zeros_like(mean) elif ndim(beta) > 1: - beta = tf.squeeze(beta) + beta = tf.reshape(beta, [-1]) if gamma is None: gamma = ones_like(mean) elif ndim(gamma) > 1: - gamma = tf.squeeze(gamma) + gamma = tf.reshape(gamma, [-1]) y, _, _ = tf.nn.fused_batch_norm( x, gamma, @@ -1962,7 +1983,11 @@ def permute_dimensions(x, pattern): return tf.transpose(x, perm=pattern) -def resize_images(x, height_factor, width_factor, data_format): +def resize_images(x, + height_factor, + width_factor, + data_format, + interpolation='nearest'): """Resizes the images contained in a 4D tensor. # Arguments @@ -1970,6 +1995,7 @@ def resize_images(x, height_factor, width_factor, data_format): height_factor: Positive integer. width_factor: Positive integer. data_format: string, `"channels_last"` or `"channels_first"`. + interpolation: A string, one of `nearest` or `bilinear`. # Returns A tensor. @@ -1978,25 +2004,39 @@ def resize_images(x, height_factor, width_factor, data_format): ValueError: if `data_format` is neither `"channels_last"` or `"channels_first"`.
""" if data_format == 'channels_first': - original_shape = int_shape(x) - new_shape = tf.shape(x)[2:] - new_shape *= tf.constant(np.array([height_factor, width_factor]).astype('int32')) + rows, cols = 2, 3 + else: + rows, cols = 1, 2 + + original_shape = int_shape(x) + new_shape = tf.shape(x)[rows:cols + 1] + new_shape *= tf.constant(np.array([height_factor, width_factor], dtype='int32')) + + if data_format == 'channels_first': x = permute_dimensions(x, [0, 2, 3, 1]) + if interpolation == 'nearest': x = tf.image.resize_nearest_neighbor(x, new_shape) + elif interpolation == 'bilinear': + x = tf.image.resize_bilinear(x, new_shape) + else: + raise ValueError('interpolation should be one ' + 'of "nearest" or "bilinear".') + if data_format == 'channels_first': x = permute_dimensions(x, [0, 3, 1, 2]) - x.set_shape((None, None, original_shape[2] * height_factor if original_shape[2] is not None else None, - original_shape[3] * width_factor if original_shape[3] is not None else None)) - return x - elif data_format == 'channels_last': - original_shape = int_shape(x) - new_shape = tf.shape(x)[1:3] - new_shape *= tf.constant(np.array([height_factor, width_factor]).astype('int32')) - x = tf.image.resize_nearest_neighbor(x, new_shape) - x.set_shape((None, original_shape[1] * height_factor if original_shape[1] is not None else None, - original_shape[2] * width_factor if original_shape[2] is not None else None, None)) - return x + + if original_shape[rows] is None: + new_height = None else: - raise ValueError('Unknown data_format: ' + str(data_format)) + new_height = original_shape[rows] * height_factor + + if original_shape[cols] is None: + new_width = None + else: + new_width = original_shape[cols] * width_factor + + output_shape = (None, new_height, new_width, None) + x.set_shape(transpose_shape(output_shape, data_format, spatial_axes=(1, 2))) + return x def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): @@ -2525,7 +2565,10 @@ def __init__(self, inputs, outputs, # (since the outputs of fetches are never returned). # This requires us to wrap fetches in `identity` ops. self.fetches = [tf.identity(x) for x in self.fetches] - self.session_kwargs = session_kwargs + # self.session_kwargs is used for _legacy_call + self.session_kwargs = session_kwargs.copy() + self.run_options = session_kwargs.pop('options', None) + self.run_metadata = session_kwargs.pop('run_metadata', None) if session_kwargs: raise ValueError('Some keys in session_kwargs are not ' 'supported at this ' @@ -2573,6 +2616,9 @@ def _make_callable(self, feed_arrays, feed_symbols, symbol_vals, session): callable_opts.fetch.append(x.name) # Handle updates. callable_opts.target.append(self.updates_op.name) + # Handle run_options. + if self.run_options: + callable_opts.run_options.CopyFrom(self.run_options) # Create callable. 
callable_fn = session._make_callable_from_options(callable_opts) # Cache parameters corresponding to the generated callable, so that @@ -2623,7 +2669,10 @@ def _call(self, inputs): feed_symbols, symbol_vals, session) - fetched = self._callable_fn(*array_vals) + if self.run_metadata: + fetched = self._callable_fn(*array_vals, run_metadata=self.run_metadata) + else: + fetched = self._callable_fn(*array_vals) return fetched[:len(self.outputs)] def _legacy_call(self, inputs): @@ -2653,6 +2702,16 @@ def __call__(self, inputs): 'supported with sparse inputs.') return self._legacy_call(inputs) + # callable generated by Session._make_callable_from_options accepts + # `run_metadata` keyword argument since TF 1.10 + if (self.run_metadata and + StrictVersion(tf.__version__.split('-')[0]) < StrictVersion('1.10.0')): + if py_any(is_tensor(x) for x in inputs): + raise ValueError( + 'In order to feed symbolic tensors to a Keras model and set ' + '`run_metadata`, you need tensorflow 1.10 or higher.') + return self._legacy_call(inputs) + return self._call(inputs) else: if py_any(is_tensor(x) for x in inputs): @@ -3089,27 +3148,55 @@ def in_test_phase(x, alt, training=None): # NN OPERATIONS -def relu(x, alpha=0., max_value=None): +def relu(x, alpha=0., max_value=None, threshold=0.): """Rectified linear unit. With default values, it returns element-wise `max(x, 0)`. + Otherwise, it follows: + `f(x) = max_value` for `x >= max_value`, + `f(x) = x` for `threshold <= x < max_value`, + `f(x) = alpha * (x - threshold)` otherwise. + # Arguments x: A tensor or variable. alpha: A scalar, slope of negative section (default=`0.`). - max_value: Saturation threshold. + max_value: float. Saturation threshold. + threshold: float. Threshold value for thresholded activation. # Returns A tensor. """ + if alpha != 0.: - x = tf.nn.leaky_relu(x, alpha) + if max_value is None and threshold == 0.: + return tf.nn.leaky_relu(x, alpha=alpha) + + if threshold != 0.: + negative_part = tf.nn.relu(-x + threshold) + else: + negative_part = tf.nn.relu(-x) + + clip_max = max_value is not None + + if threshold != 0: + # computes x for x > threshold else 0 + x = x * tf.cast(tf.greater(x, threshold), floatx()) + elif max_value == 6: + # if no threshold, then can use nn.relu6 native TF op for performance + x = tf.nn.relu6(x) + clip_max = False else: x = tf.nn.relu(x) - if max_value is not None: + if clip_max: max_value = _to_tensor(max_value, x.dtype.base_dtype) - x = tf.minimum(x, max_value) + zero = _to_tensor(0., x.dtype.base_dtype) + x = tf.clip_by_value(x, zero, max_value) + + if alpha != 0: + alpha = _to_tensor(alpha, x.dtype.base_dtype) + x -= alpha * negative_part return x @@ -3400,7 +3487,9 @@ def _preprocess_conv1d_input(x, data_format): # Returns A tensor. """ - if dtype(x) == 'float64': + # tensorflow doesn't support float64 for conv layer before 1.8.0 + if (dtype(x) == 'float64' and + StrictVersion(tf.__version__.split('-')[0]) < StrictVersion('1.8.0')): x = tf.cast(x, 'float32') tf_data_format = 'NWC' # to pass TF Conv2dNative operations if data_format == 'channels_first': @@ -3411,21 +3500,25 @@ def _preprocess_conv1d_input(x, data_format): return x, tf_data_format -def _preprocess_conv2d_input(x, data_format): +def _preprocess_conv2d_input(x, data_format, force_transpose=False): """Transpose and cast the input before the conv2d. # Arguments x: input tensor. data_format: string, `"channels_last"` or `"channels_first"`. 
+ force_transpose: boolean, whether force to transpose input from NCHW to NHWC + if the `data_format` is `"channels_first"`. # Returns A tensor. """ - if dtype(x) == 'float64': + # tensorflow doesn't support float64 for conv layer before 1.8.0 + if (dtype(x) == 'float64' and + StrictVersion(tf.__version__.split('-')[0]) < StrictVersion('1.8.0')): x = tf.cast(x, 'float32') tf_data_format = 'NHWC' if data_format == 'channels_first': - if not _has_nchw_support(): + if not _has_nchw_support() or force_transpose: x = tf.transpose(x, (0, 2, 3, 1)) # NCHW -> NHWC else: tf_data_format = 'NCHW' @@ -3442,7 +3535,9 @@ def _preprocess_conv3d_input(x, data_format): # Returns A tensor. """ - if dtype(x) == 'float64': + # tensorflow doesn't support float64 for conv layer before 1.8.0 + if (dtype(x) == 'float64' and + StrictVersion(tf.__version__.split('-')[0]) < StrictVersion('1.8.0')): x = tf.cast(x, 'float32') tf_data_format = 'NDHWC' if data_format == 'channels_first': @@ -3560,7 +3655,7 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), - padding='valid', data_format=None): + padding='valid', data_format=None, dilation_rate=(1, 1)): """2D deconvolution (i.e. transposed convolution). # Arguments @@ -3572,6 +3667,7 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), data_format: string, `"channels_last"` or `"channels_first"`. Whether to use Theano or TensorFlow/CNTK data format for inputs/kernels/outputs. + dilation_rate: tuple of 2 integers. # Returns A tensor, result of transposed 2D convolution. @@ -3584,7 +3680,13 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), if isinstance(output_shape, (tuple, list)): output_shape = tf.stack(output_shape) - x, tf_data_format = _preprocess_conv2d_input(x, data_format) + # tf.nn.atrous_conv2d_transpose input only supports NHWC format + if data_format == 'channels_first' and dilation_rate != (1, 1): + force_transpose = True + else: + force_transpose = False + + x, tf_data_format = _preprocess_conv2d_input(x, data_format, force_transpose) if data_format == 'channels_first' and tf_data_format == 'NHWC': output_shape = (output_shape[0], @@ -3601,9 +3703,15 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), else: strides = (1, 1) + strides - x = tf.nn.conv2d_transpose(x, kernel, output_shape, strides, - padding=padding, - data_format=tf_data_format) + if dilation_rate == (1, 1): + x = tf.nn.conv2d_transpose(x, kernel, output_shape, strides, + padding=padding, + data_format=tf_data_format) + else: + assert dilation_rate[0] == dilation_rate[1] + x = tf.nn.atrous_conv2d_transpose( + x, kernel, output_shape, dilation_rate[0], padding) + if data_format == 'channels_first' and tf_data_format == 'NHWC': x = tf.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW return x diff --git a/keras/backend/theano_backend.py b/keras/backend/theano_backend.py index 70f10bb7c48..7684006cc0e 100644 --- a/keras/backend/theano_backend.py +++ b/keras/backend/theano_backend.py @@ -589,6 +589,19 @@ def any(x, axis=None, keepdims=False): """Bitwise reduction (logical OR). """ y = T.any(x, axis=axis, keepdims=keepdims) + y = _set_keras_shape_for_reduction(x, y, axis, keepdims) + return y + + +def all(x, axis=None, keepdims=False): + """Bitwise reduction (logical AND). 
+ """ + y = T.all(x, axis=axis, keepdims=keepdims) + y = _set_keras_shape_for_reduction(x, y, axis, keepdims) + return y + + +def _set_keras_shape_for_reduction(x, y, axis, keepdims): if hasattr(x, '_keras_shape'): if axis is None: y._keras_shape = (1,) * len(x._keras_shape) if keepdims else (1,) @@ -610,12 +623,6 @@ def any(x, axis=None, keepdims=False): return y -def all(x, axis=None, keepdims=False): - """Bitwise reduction (logical AND). - """ - return T.all(x, axis=axis, keepdims=keepdims) - - def argmax(x, axis=-1): return T.argmax(x, axis=axis, keepdims=False) @@ -885,13 +892,25 @@ def concatenate(tensors, axis=-1): if py_all([is_sparse(x) for x in tensors]): axis = axis % ndim(tensors[0]) if axis == 0: - return th_sparse_module.basic.vstack(tensors, format='csr') + output = th_sparse_module.basic.vstack(tensors, format='csr') elif axis == 1: - return th_sparse_module.basic.hstack(tensors, format='csr') + output = th_sparse_module.basic.hstack(tensors, format='csr') else: raise ValueError('Invalid concat axis for sparse matrix:', axis) else: - return T.concatenate([to_dense(x) for x in tensors], axis=axis) + output = T.concatenate([to_dense(x) for x in tensors], axis=axis) + + if py_all([hasattr(tensor, '_keras_shape') for tensor in tensors]): + input_shapes = [tensor._keras_shape for tensor in tensors] + output_shape = list(input_shapes[0]) + for shape in input_shapes[1:]: + if output_shape[axis] is None or shape[axis] is None: + output_shape[axis] = None + break + output_shape[axis] += shape[axis] + output._keras_shape = tuple(output_shape) + + return output def reshape(x, shape): @@ -934,7 +953,11 @@ def repeat_elements(x, rep, axis): return y -def resize_images(x, height_factor, width_factor, data_format): +def resize_images(x, + height_factor, + width_factor, + data_format, + interpolation='nearest'): """Resize the images contained in a 4D tensor of shape - [batch, channels, height, width] (for 'channels_first' data_format) - [batch, height, width, channels] (for 'channels_last' data_format) @@ -942,16 +965,40 @@ def resize_images(x, height_factor, width_factor, data_format): positive integers. 
""" if data_format == 'channels_first': - output = repeat_elements(x, height_factor, axis=2) - output = repeat_elements(output, width_factor, axis=3) - return output + axis_1 = 2 + axis_2 = 3 elif data_format == 'channels_last': - output = repeat_elements(x, height_factor, axis=1) - output = repeat_elements(output, width_factor, axis=2) - return output + axis_1 = 1 + axis_2 = 2 else: raise ValueError('Invalid data_format:', data_format) + if interpolation == 'nearest': + output = repeat_elements(x, height_factor, axis=axis_1) + output = repeat_elements(output, width_factor, axis=axis_2) + elif interpolation == 'bilinear': + if not (height_factor == width_factor == 2): + raise NotImplementedError( + 'Bilinear upscaling with factors other than (2, 2)' + 'is not available when using the Theano backend.') + if data_format == 'channels_last': + output = permute_dimensions(x, [0, 3, 1, 2]) + else: + output = x + output = T.nnet.abstract_conv.bilinear_upsampling(output, + ratio=height_factor) + if data_format == 'channels_last': + output = permute_dimensions(output, [0, 2, 3, 1]) + if hasattr(x, '_keras_shape'): + output._keras_shape = list(x._keras_shape) + output._keras_shape[axis_1] *= height_factor + output._keras_shape[axis_2] *= width_factor + output._keras_shape = tuple(output._keras_shape) + else: + raise ValueError('interpolation should be one of "nearest" or "bilinear".') + + return output + def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): """Resize the volume contained in a 5D tensor of shape @@ -1144,7 +1191,34 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): py_slice(left_pad, input_shape[2] + left_pad), py_slice(None)) y = T.set_subtensor(output[indices], x) - y._keras_shape = output_shape + if hasattr(x, '_keras_shape'): + if data_format == 'channels_first': + if x._keras_shape[2] is not None: + h = x._keras_shape[2] + top_pad + bottom_pad + else: + h = None + if x._keras_shape[3] is not None: + w = x._keras_shape[3] + left_pad + right_pad + else: + w = None + output_keras_shape = (x._keras_shape[0], + x._keras_shape[1], + h, + w) + else: + if x._keras_shape[1] is not None: + h = x._keras_shape[1] + top_pad + bottom_pad + else: + h = None + if x._keras_shape[2] is not None: + w = x._keras_shape[2] + left_pad + right_pad + else: + w = None + output_keras_shape = (x._keras_shape[0], + h, + w, + x._keras_shape[3]) + y._keras_shape = output_keras_shape return y @@ -1180,7 +1254,46 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): py_slice(padding[1][0], input_shape[2] + padding[1][0]), py_slice(padding[2][0], input_shape[3] + padding[2][0]), py_slice(None)) - return T.set_subtensor(output[indices], x) + y = T.set_subtensor(output[indices], x) + if hasattr(x, '_keras_shape'): + if data_format == 'channels_first': + if x._keras_shape[2] is not None: + h = x._keras_shape[2] + padding[0][0] + padding[0][1] + else: + h = None + if x._keras_shape[3] is not None: + w = x._keras_shape[3] + padding[1][0] + padding[1][1] + else: + w = None + if x._keras_shape[4] is not None: + d = x._keras_shape[4] + padding[2][0] + padding[2][1] + else: + d = None + output_keras_shape = (x._keras_shape[0], + x._keras_shape[1], + h, + w, + d) + else: + if x._keras_shape[1] is not None: + h = x._keras_shape[1] + padding[0][0] + padding[0][1] + else: + h = None + if x._keras_shape[2] is not None: + w = x._keras_shape[2] + padding[1][0] + padding[1][1] + else: + w = None + if x._keras_shape[3] is not None: + d = x._keras_shape[3] 
+ padding[2][0] + padding[2][1] + else: + d = None + output_keras_shape = (x._keras_shape[0], + h, + w, + d, + x._keras_shape[4]) + y._keras_shape = output_keras_shape + return y def stack(x, axis=0): @@ -1594,19 +1707,35 @@ def elu(x, alpha=1.0): return T.nnet.elu(x, alpha) -def relu(x, alpha=0., max_value=None): +def relu(x, alpha=0., max_value=None, threshold=0.): _assert_has_capability(T.nnet, 'relu') - x = T.nnet.relu(x, alpha) + + if alpha != 0.: + if threshold != 0.: + negative_part = T.nnet.relu(-x + threshold) + else: + negative_part = T.nnet.relu(-x) + + if threshold != 0.: + x = x * T.cast(T.gt(x, threshold), floatx()) + else: + x = T.nnet.relu(x) + if max_value is not None: - x = T.minimum(x, max_value) + x = T.clip(x, 0.0, max_value) + + if alpha != 0.: + x -= alpha * negative_part + return x def softmax(x, axis=-1): - if axis == -1 or axis == x.ndim - 1: + if (axis == -1 or axis == x.ndim - 1) and x.ndim == 2: return T.nnet.softmax(x) - return T.exp(x - x.max()) / T.exp( - x - x.max()).sum(axis=axis, keepdims=True) + xm = x.max(axis=axis, keepdims=True) + return T.exp(x - xm) / T.exp( + x - xm).sum(axis=axis, keepdims=True) def softplus(x): @@ -2009,7 +2138,7 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), - padding='valid', data_format=None): + padding='valid', data_format=None, dilation_rate=(1, 1)): """2D deconvolution (transposed convolution). # Arguments @@ -2019,7 +2148,8 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), padding: string, "same" or "valid". data_format: "channels_last" or "channels_first". Whether to use Theano or TensorFlow data format - in inputs/kernels/outputs. + in inputs/kernels/outputs. + dilation_rate: tuple of 2 integers. # Raises ValueError: if using an even kernel size with padding 'same'. @@ -2052,7 +2182,8 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), kshp=kernel_shape, subsample=strides, border_mode=th_padding, - filter_flip=not flip_filters) + filter_flip=not flip_filters, + filter_dilation=dilation_rate) conv_out = op(kernel, x, output_shape[2:]) conv_out = _postprocess_conv2d_output(conv_out, x, padding, kernel_shape, strides, data_format) diff --git a/keras/callbacks.py b/keras/callbacks.py index f151e688033..b9300429bfd 100644 --- a/keras/callbacks.py +++ b/keras/callbacks.py @@ -12,6 +12,8 @@ import time import json import warnings +import io +import sys from collections import deque from collections import OrderedDict @@ -117,9 +119,10 @@ def on_batch_end(self, batch, logs=None): delta_t_median = np.median(self._delta_ts_batch_end) if (self._delta_t_batch > 0. and (delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1)): - warnings.warn('Method on_batch_end() is slow compared ' - 'to the batch update (%f). Check your callbacks.' - % delta_t_median) + warnings.warn('In your callbacks, method `on_batch_end()` ' + 'is slow compared to a model step ' + '(%f vs %f). Check your callbacks.' + % (delta_t_median, self._delta_t_batch)) def on_train_begin(self, logs=None): """Called at the beginning of training. @@ -480,6 +483,10 @@ class EarlyStopping(Callback): baseline: Baseline value for the monitored quantity to reach. Training will stop if the model doesn't show improvement over the baseline. + restore_best_weights: whether to restore model weights from + the epoch with the best value of the monitored quantity. + If False, the model weights obtained at the last step of + training are used. 
""" def __init__(self, @@ -488,7 +495,8 @@ def __init__(self, patience=0, verbose=0, mode='auto', - baseline=None): + baseline=None, + restore_best_weights=False): super(EarlyStopping, self).__init__() self.monitor = monitor @@ -498,6 +506,8 @@ def __init__(self, self.min_delta = min_delta self.wait = 0 self.stopped_epoch = 0 + self.restore_best_weights = restore_best_weights + self.best_weights = None if mode not in ['auto', 'min', 'max']: warnings.warn('EarlyStopping mode %s is unknown, ' @@ -530,27 +540,40 @@ def on_train_begin(self, logs=None): self.best = np.Inf if self.monitor_op == np.less else -np.Inf def on_epoch_end(self, epoch, logs=None): - current = logs.get(self.monitor) + current = self.get_monitor_value(logs) if current is None: - warnings.warn( - 'Early stopping conditioned on metric `%s` ' - 'which is not available. Available metrics are: %s' % - (self.monitor, ','.join(list(logs.keys()))), RuntimeWarning - ) return + if self.monitor_op(current - self.min_delta, self.best): self.best = current self.wait = 0 + if self.restore_best_weights: + self.best_weights = self.model.get_weights() else: self.wait += 1 if self.wait >= self.patience: self.stopped_epoch = epoch self.model.stop_training = True + if self.restore_best_weights: + if self.verbose > 0: + print('Restoring model weights from the end of ' + 'the best epoch') + self.model.set_weights(self.best_weights) def on_train_end(self, logs=None): if self.stopped_epoch > 0 and self.verbose > 0: print('Epoch %05d: early stopping' % (self.stopped_epoch + 1)) + def get_monitor_value(self, logs): + monitor_value = logs.get(self.monitor) + if monitor_value is None: + warnings.warn( + 'Early stopping conditioned on metric `%s` ' + 'which is not available. Available metrics are: %s' % + (self.monitor, ','.join(list(logs.keys()))), RuntimeWarning + ) + return monitor_value + class RemoteMonitor(Callback): """Callback used to stream events to a server. @@ -559,16 +582,18 @@ class RemoteMonitor(Callback): Events are sent to `root + '/publish/epoch/end/'` by default. Calls are HTTP POST, with a `data` argument which is a JSON-encoded dictionary of event data. - If send_as_json is set to True, the content type of the request will be application/json. - Otherwise the serialized JSON will be send within a form + If send_as_json is set to True, the content type of the request will be + application/json. Otherwise the serialized JSON will be send within a form # Arguments root: String; root url of the target server. path: String; path relative to `root` to which the events will be sent. - field: String; JSON field under which the data will be stored. The field is used only if the payload is sent - within a form (i.e. send_as_json is set to False). + field: String; JSON field under which the data will be stored. + The field is used only if the payload is sent within a form + (i.e. send_as_json is set to False). headers: Dictionary; optional custom HTTP headers. - send_as_json: Boolean; whether the request should be send as application/json. + send_as_json: Boolean; whether the request should be send as + application/json. """ def __init__(self, @@ -640,6 +665,10 @@ def on_epoch_begin(self, epoch, logs=None): print('\nEpoch %05d: LearningRateScheduler setting learning ' 'rate to %s.' % (epoch + 1, lr)) + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + logs['lr'] = K.get_value(self.model.optimizer.lr) + class TensorBoard(Callback): """TensorBoard basic visualizations. 
@@ -692,7 +721,14 @@ class TensorBoard(Callback): embeddings_data: data to be embedded at layers specified in `embeddings_layer_names`. Numpy array (if the model has a single input) or list of Numpy arrays (if the model has multiple inputs). - Learn [more about embeddings](https://www.tensorflow.org/programmers_guide/embedding) + Learn [more about embeddings] + (https://www.tensorflow.org/programmers_guide/embedding). + update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`, writes + the losses and metrics to TensorBoard after each batch. The same + applies for `'epoch'`. If using an integer, let's say `10000`, + the callback will write the metrics and losses to TensorBoard every + 10000 samples. Note that writing too frequently to TensorBoard + can slow down your training. """ def __init__(self, log_dir='./logs', @@ -704,14 +740,16 @@ def __init__(self, log_dir='./logs', embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None, - embeddings_data=None): + embeddings_data=None, + update_freq='epoch'): super(TensorBoard, self).__init__() global tf, projector try: import tensorflow as tf from tensorflow.contrib.tensorboard.plugins import projector except ImportError: - raise ImportError('You need the TensorFlow module installed to use TensorBoard.') + raise ImportError('You need the TensorFlow module installed to ' + 'use TensorBoard.') if K.backend() != 'tensorflow': if histogram_freq != 0: @@ -742,6 +780,13 @@ def __init__(self, log_dir='./logs', self.embeddings_metadata = embeddings_metadata or {} self.batch_size = batch_size self.embeddings_data = embeddings_data + if update_freq == 'batch': + # It is the same as writing as frequently as possible. + self.update_freq = 1 + else: + self.update_freq = update_freq + self.samples_seen = 0 + self.samples_seen_at_last_write = 0 def set_model(self, model): self.model = model @@ -762,7 +807,8 @@ def is_indexed_slices(grad): grads = [ grad.values if is_indexed_slices(grad) else grad for grad in grads] - tf.summary.histogram('{}_grad'.format(mapped_weight_name), grads) + tf.summary.histogram('{}_grad'.format(mapped_weight_name), + grads) if self.write_images: w_img = tf.squeeze(weight) shape = K.int_shape(w_img) @@ -800,7 +846,8 @@ def is_indexed_slices(grad): if hasattr(layer, 'output'): if isinstance(layer.output, list): for i, output in enumerate(layer.output): - tf.summary.histogram('{}_out_{}'.format(layer.name, i), output) + tf.summary.histogram('{}_out_{}'.format(layer.name, i), + output) else: tf.summary.histogram('{}_out'.format(layer.name), layer.output) @@ -813,7 +860,8 @@ def is_indexed_slices(grad): self.writer = tf.summary.FileWriter(self.log_dir) if self.embeddings_freq and self.embeddings_data is not None: - self.embeddings_data = standardize_input_data(self.embeddings_data, model.input_names) + self.embeddings_data = standardize_input_data(self.embeddings_data, + model.input_names) embeddings_layer_names = self.embeddings_layer_names @@ -920,8 +968,8 @@ def on_epoch_end(self, epoch, logs=None): batch = slice(i, i + step) if type(self.model.input) == list: - feed_dict = {model_input: embeddings_data[idx][batch] - for idx, model_input in enumerate(self.model.input)} + feed_dict = {_input: embeddings_data[idx][batch] + for idx, _input in enumerate(self.model.input)} else: feed_dict = {self.model.input: embeddings_data[0][batch]} @@ -932,24 +980,43 @@ def on_epoch_end(self, epoch, logs=None): self.sess.run(self.assign_embeddings, feed_dict=feed_dict) self.saver.save(self.sess, - os.path.join(self.log_dir, 
'keras_embedding.ckpt'), + os.path.join(self.log_dir, + 'keras_embedding.ckpt'), epoch) i += self.batch_size + if self.update_freq == 'epoch': + index = epoch + else: + index = self.samples_seen + self._write_logs(logs, index) + + def _write_logs(self, logs, index): for name, value in logs.items(): if name in ['batch', 'size']: continue summary = tf.Summary() summary_value = summary.value.add() - summary_value.simple_value = value.item() + if isinstance(value, np.ndarray): + summary_value.simple_value = value.item() + else: + summary_value.simple_value = value summary_value.tag = name - self.writer.add_summary(summary, epoch) + self.writer.add_summary(summary, index) self.writer.flush() def on_train_end(self, _): self.writer.close() + def on_batch_end(self, batch, logs=None): + if self.update_freq != 'epoch': + self.samples_seen += logs['size'] + samples_seen_since = self.samples_seen - self.samples_seen_at_last_write + if samples_seen_since >= self.update_freq: + self._write_logs(logs, self.samples_seen) + self.samples_seen_at_last_write = self.samples_seen + class ReduceLROnPlateau(Callback): """Reduce learning rate when a metric has stopped improving. @@ -1063,8 +1130,8 @@ def on_epoch_end(self, epoch, logs=None): new_lr = max(new_lr, self.min_lr) K.set_value(self.model.optimizer.lr, new_lr) if self.verbose > 0: - print('\nEpoch %05d: ReduceLROnPlateau reducing learning ' - 'rate to %s.' % (epoch + 1, new_lr)) + print('\nEpoch %05d: ReduceLROnPlateau reducing ' + 'learning rate to %s.' % (epoch + 1, new_lr)) self.cooldown_counter = self.cooldown self.wait = 0 @@ -1099,7 +1166,12 @@ def __init__(self, filename, separator=',', append=False): self.writer = None self.keys = None self.append_header = True - self.file_flags = 'b' if six.PY2 and os.name == 'nt' else '' + if six.PY2: + self.file_flags = 'b' + self._open_args = {} + else: + self.file_flags = '' + self._open_args = {'newline': '\n'} super(CSVLogger, self).__init__() def on_train_begin(self, logs=None): @@ -1107,9 +1179,12 @@ def on_train_begin(self, logs=None): if os.path.exists(self.filename): with open(self.filename, 'r' + self.file_flags) as f: self.append_header = not bool(len(f.readline())) - self.csv_file = open(self.filename, 'a' + self.file_flags) + mode = 'a' else: - self.csv_file = open(self.filename, 'w' + self.file_flags) + mode = 'w' + self.csv_file = io.open(self.filename, + mode + self.file_flags, + **self._open_args) def on_epoch_end(self, epoch, logs=None): logs = logs or {} @@ -1128,14 +1203,17 @@ def handle_value(k): if self.model.stop_training: # We set NA so that csv parsers do not fail for this last epoch. - logs = dict([(k, logs[k]) if k in logs else (k, 'NA') for k in self.keys]) + logs = dict([(k, logs[k] if k in logs else 'NA') for k in self.keys]) if not self.writer: class CustomDialect(csv.excel): delimiter = self.sep - + fieldnames = ['epoch'] + self.keys + if six.PY2: + fieldnames = [unicode(x) for x in fieldnames] self.writer = csv.DictWriter(self.csv_file, - fieldnames=['epoch'] + self.keys, dialect=CustomDialect) + fieldnames=fieldnames, + dialect=CustomDialect) if self.append_header: self.writer.writeheader() @@ -1243,10 +1321,10 @@ class MXNetModelCheckpoint(ModelCheckpoint): If save_best_only is True, saves the MXNet model in the format '-symbol.json' and '-0000.params'. i.e., uses 0000 as placeholder for the epoch. You will have only one best model at the end of training. 
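A short sketch of the `update_freq` argument introduced on `TensorBoard` above; the log directory, sample count and fit call are placeholders:

```python
from keras.callbacks import TensorBoard

# update_freq accepts 'epoch' (default), 'batch', or an integer N meaning
# "write losses/metrics roughly every N samples seen".
tensorboard = TensorBoard(log_dir='./logs', update_freq=10000)

# model.fit(x, y, epochs=10, callbacks=[tensorboard])  # assumes a compiled model
```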
- If save_best_only is False, i.e., you want to save the MXNet model after each epoch, this callback saves the Model - in the format '-symbol.json' and '-.params'. Example: If you are running the training job - for 2 epochs, you will have one symbol file - '-symbol.json' and 2 params file, one each for the 2 epochs, - '-0000.params' and '-0001.params' + If save_best_only is False, i.e., you want to save the MXNet model after each epoch, this callback saves the + Model in the format '-symbol.json' and '-.params'. Example: If you are running the + training job for 2 epochs, you will have one symbol file - '-symbol.json' and 2 params file, one each + for the 2 epochs, '-0000.params' and '-0001.params' # Arguments prefix: Prefix name of the saved MXNet Model (symbol and params) files. diff --git a/keras/constraints.py b/keras/constraints.py index b3a97edb533..c1711c11a05 100644 --- a/keras/constraints.py +++ b/keras/constraints.py @@ -40,7 +40,8 @@ class MaxNorm(Constraint): `(rows, cols, input_depth)`. # References - - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting Srivastava, Hinton, et al. 2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf) + - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting] + (http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf) """ def __init__(self, max_value=2, axis=0): diff --git a/keras/datasets/boston_housing.py b/keras/datasets/boston_housing.py index e7d23810d26..6e5bfc82ca0 100644 --- a/keras/datasets/boston_housing.py +++ b/keras/datasets/boston_housing.py @@ -22,13 +22,13 @@ def load_data(path='boston_housing.npz', test_split=0.2, seed=113): Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. """ assert 0 <= test_split < 1 - path = get_file(path, - origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz', - file_hash='f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5') - f = np.load(path) - x = f['x'] - y = f['y'] - f.close() + path = get_file( + path, + origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz', + file_hash='f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5') + with np.load(path) as f: + x = f['x'] + y = f['y'] np.random.seed(seed) indices = np.arange(len(x)) diff --git a/keras/datasets/imdb.py b/keras/datasets/imdb.py index 8f6cb4f9516..2d5518d04ea 100644 --- a/keras/datasets/imdb.py +++ b/keras/datasets/imdb.py @@ -91,9 +91,11 @@ def load_data(path='imdb.npz', num_words=None, skip_top=0, # reserve 'index_from' (=3 by default) characters: # 0 (padding), 1 (start), 2 (OOV) if oov_char is not None: - xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs] + xs = [[w if (skip_top <= w < num_words) else oov_char for w in x] + for x in xs] else: - xs = [[w for w in x if skip_top <= w < num_words] for x in xs] + xs = [[w for w in x if skip_top <= w < num_words] + for x in xs] idx = len(x_train) x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx]) @@ -111,8 +113,9 @@ def get_word_index(path='imdb_word_index.json'): # Returns The word index dictionary. 
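For context on the dataset loaders touched above, a minimal usage sketch; downloading, caching and hash checking are handled internally by `get_file`:

```python
from keras.datasets import boston_housing, imdb

# Returns a tuple of Numpy arrays, as documented above.
(x_train, y_train), (x_test, y_test) = boston_housing.load_data()

# Dict mapping words to their integer indices.
word_index = imdb.get_word_index()
```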
""" - path = get_file(path, - origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json', - file_hash='bfafd718b763782e994055a2d397834f') + path = get_file( + path, + origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json', + file_hash='bfafd718b763782e994055a2d397834f') with open(path) as f: return json.load(f) diff --git a/keras/datasets/reuters.py b/keras/datasets/reuters.py index 44aff6070cc..04fd31fb3b9 100644 --- a/keras/datasets/reuters.py +++ b/keras/datasets/reuters.py @@ -97,9 +97,10 @@ def get_word_index(path='reuters_word_index.json'): # Returns The word index dictionary. """ - path = get_file(path, - origin='https://s3.amazonaws.com/text-datasets/reuters_word_index.json', - file_hash='4d44cc38712099c9e383dc6e5f11a921') + path = get_file( + path, + origin='https://s3.amazonaws.com/text-datasets/reuters_word_index.json', + file_hash='4d44cc38712099c9e383dc6e5f11a921') f = open(path) data = json.load(f) f.close() diff --git a/keras/engine/network.py b/keras/engine/network.py index 51ea1d12b95..530ab9d9c6d 100644 --- a/keras/engine/network.py +++ b/keras/engine/network.py @@ -139,14 +139,8 @@ def _base_init(self, name=None): def _init_graph_network(self, inputs, outputs, name=None): self._uses_inputs_arg = True # Normalize and set self.inputs, self.outputs. - if isinstance(inputs, (list, tuple)): - self.inputs = list(inputs) # Tensor or list of tensors. - else: - self.inputs = [inputs] - if isinstance(outputs, (list, tuple)): - self.outputs = list(outputs) - else: - self.outputs = [outputs] + self.inputs = to_list(inputs, allow_tuple=True) + self.outputs = to_list(outputs, allow_tuple=True) # User-provided argument validation. # Check for redundancy in inputs. @@ -635,11 +629,13 @@ def compute_output_shape(self, input_shape): inbound_layer = node.inbound_layers[j] node_index = node.node_indices[j] tensor_index = node.tensor_indices[j] - shape_key = inbound_layer.name + '_%s_%s' % (node_index, tensor_index) + shape_key = inbound_layer.name + shape_key += '_%s_%s' % (node_index, tensor_index) input_shape = layers_to_output_shapes[shape_key] input_shapes.append(input_shape) - output_shape = layer.compute_output_shape(unpack_singleton(input_shapes)) + output_shape = layer.compute_output_shape( + unpack_singleton(input_shapes)) output_shapes = to_list(output_shape) node_index = layer._inbound_nodes.index(node) @@ -721,7 +717,8 @@ def run_internal_graph(self, inputs, masks=None): if has_arg(layer.call, 'mask'): if 'mask' not in kwargs: kwargs['mask'] = computed_mask - output_tensors = to_list(layer.call(computed_tensor, **kwargs)) + output_tensors = to_list( + layer.call(computed_tensor, **kwargs)) output_masks = layer.compute_mask(computed_tensor, computed_mask) if output_masks is None: @@ -747,7 +744,8 @@ def run_internal_graph(self, inputs, masks=None): else: output_masks = to_list(output_masks) # Apply activity regularizer if any: - if hasattr(layer, 'activity_regularizer') and layer.activity_regularizer is not None: + if (hasattr(layer, 'activity_regularizer') and + layer.activity_regularizer is not None): with K.name_scope('activity_regularizer'): regularization_losses = [ layer.activity_regularizer(x) @@ -759,8 +757,8 @@ def run_internal_graph(self, inputs, masks=None): raise Exception( 'Layers should have equal number of output tensors ' 'and output masks. 
Layer ' + str(layer.name) + ' has' - ' ' + str(len(output_tensors)) + ' output tensors and' - ' ' + str(len(output_masks)) + ' output masks.') + ' ' + str(len(output_tensors)) + ' output tensors ' + 'and ' + str(len(output_masks)) + ' output masks.') # Update model updates and losses: # Keep track of updates that depend on the inputs # (e.g. BN updates). @@ -776,16 +774,21 @@ def run_internal_graph(self, inputs, masks=None): # Update _keras_shape. if all([hasattr(x, '_keras_shape') for x in computed_tensors]): - input_shapes = unpack_singleton([x._keras_shape for x in computed_tensors]) + input_shapes = unpack_singleton( + [x._keras_shape for x in computed_tensors]) shapes = to_list(layer.compute_output_shape(input_shapes)) - uses_learning_phase = any([x._uses_learning_phase for x in computed_tensors]) + uses_learning_phase = any( + [x._uses_learning_phase for x in computed_tensors]) for x, s in zip(output_tensors, shapes): x._keras_shape = s - x._uses_learning_phase = getattr(x, '_uses_learning_phase', False) or uses_learning_phase + _u = getattr(x, '_uses_learning_phase', False) + x._uses_learning_phase = _u or uses_learning_phase # Update tensor_map. - for x, y, mask in zip(reference_output_tensors, output_tensors, output_masks): + for x, y, mask in zip(reference_output_tensors, + output_tensors, + output_masks): tensor_map[str(id(x))] = (y, mask) output_tensors = [] @@ -1246,15 +1249,23 @@ def summary(self, line_length=None, positions=None, print_fn=None): """ if not self.built: raise ValueError( - 'This model has never been called, thus its weights ' - 'have not yet been created, so no summary can be displayed. ' - 'Build the model first ' - '(e.g. by calling it on some test data).') + 'This model has not yet been built. ' + 'Build the model first by calling build() ' + 'or calling fit() with some data. ' + 'Or specify input_shape or batch_input_shape ' + 'in the first layer for automatic build. ') return print_layer_summary(self, line_length=line_length, positions=positions, print_fn=print_fn) + def __getstate__(self): + return saving.pickle_model(self) + + def __setstate__(self, state): + model = saving.unpickle_model(state) + self.__dict__.update(model.__dict__) + def _make_node_key(layer_name, node_index): return layer_name + '_ib-' + str(node_index) diff --git a/keras/engine/saving.py b/keras/engine/saving.py index 007fd4249a3..6eaa3a81c79 100644 --- a/keras/engine/saving.py +++ b/keras/engine/saving.py @@ -14,6 +14,7 @@ from .. import backend as K from .. import optimizers from ..utils.io_utils import ask_to_proceed_with_overwrite +from ..utils.io_utils import h5dict from ..utils import conv_utils try: @@ -100,41 +101,20 @@ def save_mxnet_model(model, prefix, epoch=0): return data_names, data_shapes -def save_model(model, filepath, overwrite=True, include_optimizer=True): - """Save a model to a HDF5 file. - - Note: Please also see - [How can I install HDF5 or h5py to save my models in Keras?]( - /getting-started/faq/ - #how-can-i-install-HDF5-or-h5py-to-save-my-models-in-Keras) - in the FAQ for instructions on how to install `h5py`. - - The saved model contains: - - the model's configuration (topology) - - the model's weights - - the model's optimizer's state (if any) +def _serialize_model(model, f, include_optimizer=True): + """Model serialization logic. - Thus the saved model can be reinstantiated in - the exact same state, without any of the code - used for model definition or training. + This method is used for both writing to HDF5 file/group, + as well as pickling. 
This is achieved via a + `keras.utils.hdf5_utls.H5Dict` object, which can wrap HDF5 + files, groups and dicts with a common API. # Arguments - model: Keras model instance to be saved. - filepath: one of the following: - - string, path where to save the model, or - - h5py.File object where to save the model - overwrite: Whether we should overwrite any existing - model at the target location, or instead - ask the user with a manual prompt. - include_optimizer: If True, save optimizer's state together. + model: Keras model instance to be serialized. + f: keras.utils.io_utils.HD5Dict instance. + include_optimizer: If True, serialize optimizer's state together. - # Raises - ImportError: if h5py is not available. """ - - if h5py is None: - raise ImportError('`save_model` requires h5py.') - def get_json_type(obj): """Serialize any object to a JSON-serializable structure. @@ -168,110 +148,111 @@ def get_json_type(obj): if type(obj).__name__ == type.__name__: return obj.__name__ - raise TypeError('Not JSON Serializable:', obj) + raise TypeError('Not JSON Serializable: %s' % (obj,)) from .. import __version__ as keras_version - if not isinstance(filepath, h5py.File): - # If file exists and should not be overwritten. - if not overwrite and os.path.isfile(filepath): - proceed = ask_to_proceed_with_overwrite(filepath) - if not proceed: - return - - f = h5py.File(filepath, mode='w') - opened_new_file = True - else: - f = filepath - opened_new_file = False - - try: - f.attrs['keras_version'] = str(keras_version).encode('utf8') - f.attrs['backend'] = K.backend().encode('utf8') - f.attrs['model_config'] = json.dumps({ - 'class_name': model.__class__.__name__, - 'config': model.get_config() - }, default=get_json_type).encode('utf8') - - model_weights_group = f.create_group('model_weights') - model_layers = model.layers - save_weights_to_hdf5_group(model_weights_group, model_layers) - - if include_optimizer and model.optimizer: - if isinstance(model.optimizer, optimizers.TFOptimizer): - warnings.warn( - 'TensorFlow optimizers do not ' - 'make it possible to access ' - 'optimizer attributes or optimizer state ' - 'after instantiation. ' - 'As a result, we cannot save the optimizer ' - 'as part of the model save file.' - 'You will have to compile your model again ' - 'after loading it. 
' - 'Prefer using a Keras optimizer instead ' - '(see keras.io/optimizers).') + f['keras_version'] = str(keras_version).encode('utf8') + f['backend'] = K.backend().encode('utf8') + + model_config = {} + model_config['class_name'] = model.__class__.__name__ + model_config['config'] = model.get_config() + model_config = json.dumps(model_config, default=get_json_type) + model_config = model_config.encode('utf-8') + f['model_config'] = model_config + + model_weights_group = f['model_weights'] + model_layers = model.layers + model_weights_group['layer_names'] = [layer.name.encode('utf8') + for layer in model_layers] + model_weights_group['backend'] = K.backend().encode('utf8') + model_weights_group['keras_version'] = str(keras_version).encode('utf8') + for layer in model_layers: + layer_group = model_weights_group[layer.name] + symbolic_weights = layer.weights + weight_values = K.batch_get_value(symbolic_weights) + weight_names = [] + for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)): + if hasattr(w, 'name') and w.name: + name = str(w.name) else: - f.attrs['training_config'] = json.dumps({ - 'optimizer_config': { - 'class_name': model.optimizer.__class__.__name__, - 'config': model.optimizer.get_config() - }, - 'loss': model.loss, - 'metrics': model.metrics, - 'sample_weight_mode': model.sample_weight_mode, - 'loss_weights': model.loss_weights, - }, default=get_json_type).encode('utf8') - - # Save optimizer weights. - symbolic_weights = getattr(model.optimizer, 'weights') - if symbolic_weights: - optimizer_weights_group = f.create_group( - 'optimizer_weights') - weight_values = K.batch_get_value(symbolic_weights) - weight_names = [] - for i, (w, val) in enumerate(zip(symbolic_weights, - weight_values)): - # Default values of symbolic_weights is /variable - # for Theano and CNTK - if K.backend() == 'theano' or K.backend() == 'cntk': - if hasattr(w, 'name'): - if w.name.split('/')[-1] == 'variable': - name = str(w.name) + '_' + str(i) - else: - name = str(w.name) + name = 'param_' + str(i) + if name in weight_names: + idx = 2 + unique_name = name + '_1' + while unique_name in weight_names: + unique_name = name + '_' + str(idx) + idx += 1 + name = unique_name + weight_names.append(name.encode('utf8')) + layer_group['weight_names'] = weight_names + for name, val in zip(weight_names, weight_values): + layer_group[name] = val + if include_optimizer and model.optimizer: + if isinstance(model.optimizer, optimizers.TFOptimizer): + warnings.warn( + 'TensorFlow optimizers do not ' + 'make it possible to access ' + 'optimizer attributes or optimizer state ' + 'after instantiation. ' + 'As a result, we cannot save the optimizer ' + 'as part of the model save file.' + 'You will have to compile your model again ' + 'after loading it. 
' + 'Prefer using a Keras optimizer instead ' + '(see keras.io/optimizers).') + else: + f['training_config'] = json.dumps({ + 'optimizer_config': { + 'class_name': model.optimizer.__class__.__name__, + 'config': model.optimizer.get_config() + }, + 'loss': model.loss, + 'metrics': model.metrics, + 'sample_weight_mode': model.sample_weight_mode, + 'loss_weights': model.loss_weights, + }, default=get_json_type).encode('utf8') + symbolic_weights = getattr(model.optimizer, 'weights') + if symbolic_weights: + optimizer_weights_group = f['optimizer_weights'] + weight_values = K.batch_get_value(symbolic_weights) + weight_names = [] + for i, (w, val) in enumerate(zip(symbolic_weights, + weight_values)): + # Default values of symbolic_weights is /variable + # for Theano and CNTK + if K.backend() == 'theano' or K.backend() == 'cntk': + if hasattr(w, 'name'): + if w.name.split('/')[-1] == 'variable': + name = str(w.name) + '_' + str(i) else: - name = 'param_' + str(i) - else: - if hasattr(w, 'name') and w.name: name = str(w.name) - else: - name = 'param_' + str(i) - weight_names.append(name.encode('utf8')) - optimizer_weights_group.attrs[ - 'weight_names'] = weight_names - for name, val in zip(weight_names, weight_values): - param_dset = optimizer_weights_group.create_dataset( - name, - val.shape, - dtype=val.dtype) - if not val.shape: - # scalar - param_dset[()] = val else: - param_dset[:] = val - f.flush() - finally: - if opened_new_file: - f.close() - - -def load_model(filepath, custom_objects=None, compile=True): - """Loads a model saved via `save_model`. + name = 'param_' + str(i) + else: + if hasattr(w, 'name') and w.name: + name = str(w.name) + else: + name = 'param_' + str(i) + if name in weight_names: + idx = 2 + unique_name = name + '_1' + while unique_name in weight_names: + unique_name = name + '_' + str(idx) + idx += 1 + name = unique_name + weight_names.append(name.encode('utf8')) + optimizer_weights_group['weight_names'] = weight_names + for name, val in zip(weight_names, weight_values): + optimizer_weights_group[name] = val + + +def _deserialize_model(f, custom_objects=None, compile=True): + """De-serializes a model serialized via _serialize_model # Arguments - filepath: one of the following: - - string, path to the saved model, or - - h5py.File object from which to load the model + f: `keras.utils.hdf5_utils.HFDict` instance. custom_objects: Optional dictionary mapping names (strings) to custom classes or functions to be considered during deserialization. @@ -285,14 +266,7 @@ def load_model(filepath, custom_objects=None, compile=True): a warning will be displayed. When `compile` is set to False, the compilation is omitted without any warning. - - # Raises - ImportError: if h5py is not available. - ValueError: In case of an invalid savefile. 
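Because the serialization logic above is shared between HDF5 writing and pickling (through the `pickle_model`/`unpickle_model` helpers added later in this file and the `__getstate__`/`__setstate__` methods on `Network`), model objects become picklable. A hedged sketch with a throwaway model:

```python
import pickle
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([Dense(4, activation='relu', input_shape=(3,)), Dense(1)])
model.compile(optimizer='sgd', loss='mse')

# Architecture, weights and training configuration round-trip through
# the standard pickle protocol.
clone = pickle.loads(pickle.dumps(model))
np.testing.assert_allclose(model.predict(np.ones((1, 3))),
                           clone.predict(np.ones((1, 3))))
```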
""" - if h5py is None: - raise ImportError('`load_model` requires h5py.') - if not custom_objects: custom_objects = {} @@ -321,73 +295,223 @@ def convert_custom_objects(obj): return custom_objects[obj] return obj - opened_new_file = not isinstance(filepath, h5py.File) - if opened_new_file: - f = h5py.File(filepath, mode='r') + model_config = f['model_config'] + if model_config is None: + raise ValueError('No model found in config.') + model_config = json.loads(model_config.decode('utf-8')) + model = model_from_config(model_config, custom_objects=custom_objects) + model_weights_group = f['model_weights'] + + if 'keras_version' in model_weights_group: + original_keras_version = model_weights_group['keras_version'].decode('utf8') + else: + original_keras_version = '1' + if 'backend' in model_weights_group: + original_backend = model_weights_group['backend'].decode('utf8') + else: + original_backend = None + + layer_names = model_weights_group['layer_names'] + + layers = model.layers + + filtered_layers = [] + for layer in layers: + weights = layer.weights + if weights: + filtered_layers.append(layer) + + filtered_layer_names = [] + for name in layer_names: + layer_weights = model_weights_group[name] + weight_names = layer_weights['weight_names'] + if weight_names: + filtered_layer_names.append(name) + + layer_names = filtered_layer_names + if len(layer_names) != len(filtered_layers): + raise ValueError('You are trying to load a weight file' + ' containing {} layers into a model with {} layers' + .format(len(layer_names), len(filtered_layers)) + ) + + # We batch weight value assignments in a single backend call + # which provides a speedup in TensorFlow. + weight_value_tuples = [] + for k, name in enumerate(layer_names): + layer_weights = model_weights_group[name] + weight_names = layer_weights['weight_names'] + weight_values = [layer_weights[weight_name] for weight_name in weight_names] + layer = filtered_layers[k] + symbolic_weights = layer.weights + weight_values = preprocess_weights_for_loading(layer, + weight_values, + original_keras_version, + original_backend, + reshape=False) + if len(weight_values) != len(symbolic_weights): + raise ValueError('Layer #' + str(k) + + ' (named "' + layer.name + + '" in the current model) was found to ' + 'correspond to layer ' + name + + ' in the save file. ' + 'However the new layer ' + layer.name + + ' expects ' + str(len(symbolic_weights)) + + ' weights, but the saved weights have ' + + str(len(weight_values)) + + ' elements.') + weight_value_tuples += zip(symbolic_weights, weight_values) + K.batch_set_value(weight_value_tuples) + + if compile: + training_config = f.get('training_config') + if training_config is None: + warnings.warn('No training configuration found in save file: ' + 'the model was *not* compiled. ' + 'Compile it manually.') + return model + training_config = json.loads(training_config.decode('utf-8')) + optimizer_config = training_config['optimizer_config'] + optimizer = optimizers.deserialize(optimizer_config, + custom_objects=custom_objects) + + # Recover loss functions and metrics. + loss = convert_custom_objects(training_config['loss']) + metrics = convert_custom_objects(training_config['metrics']) + sample_weight_mode = training_config['sample_weight_mode'] + loss_weights = training_config['loss_weights'] + + # Compile model. + model.compile(optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + sample_weight_mode=sample_weight_mode) + + # Set optimizer weights. 
+ if 'optimizer_weights' in f: + # Build train function (to get weight updates). + model._make_train_function() + optimizer_weights_group = f['optimizer_weights'] + optimizer_weight_names = [ + n.decode('utf8') for n in + optimizer_weights_group['weight_names']] + optimizer_weight_values = [optimizer_weights_group[n] for n in + optimizer_weight_names] + try: + model.optimizer.set_weights(optimizer_weight_values) + except ValueError: + warnings.warn('Error in loading the saved optimizer ' + 'state. As a result, your model is ' + 'starting with a freshly initialized ' + 'optimizer.') + + return model + + +def save_model(model, filepath, overwrite=True, include_optimizer=True): + """Save a model to a HDF5 file. + + Note: Please also see + [How can I install HDF5 or h5py to save my models in Keras?]( + /getting-started/faq/ + #how-can-i-install-HDF5-or-h5py-to-save-my-models-in-Keras) + in the FAQ for instructions on how to install `h5py`. + + The saved model contains: + - the model's configuration (topology) + - the model's weights + - the model's optimizer's state (if any) + + Thus the saved model can be reinstantiated in + the exact same state, without any of the code + used for model definition or training. + + # Arguments + model: Keras model instance to be saved. + filepath: one of the following: + - string, path where to save the model, or + - h5py.File or h5py.Group object where to save the model + overwrite: Whether we should overwrite any existing + model at the target location, or instead + ask the user with a manual prompt. + include_optimizer: If True, save optimizer's state together. + + # Raises + ImportError: if h5py is not available. + """ + if h5py is None: + raise ImportError('`save_model` requires h5py.') + + if not isinstance(filepath, h5py.Group): + # If file exists and should not be overwritten. + if not overwrite and os.path.isfile(filepath): + proceed = ask_to_proceed_with_overwrite(filepath) + if not proceed: + return + opened_new_file = True else: - f = filepath + opened_new_file = False + + f = h5dict(filepath, mode='w') + + try: + _serialize_model(model, f, include_optimizer) + finally: + if opened_new_file: + f.close() + + +def load_model(filepath, custom_objects=None, compile=True): + """Loads a model saved via `save_model`. + + # Arguments + filepath: one of the following: + - string, path to the saved model, or + - h5py.File or h5py.Group object from which to load the model + custom_objects: Optional dictionary mapping names + (strings) to custom classes or functions to be + considered during deserialization. + compile: Boolean, whether to compile the model + after loading. + # Returns + A Keras model instance. If an optimizer was found + as part of the saved model, the model is already + compiled. Otherwise, the model is uncompiled and + a warning will be displayed. When `compile` is set + to False, the compilation is omitted without any + warning. + + # Raises + ImportError: if h5py is not available. + ValueError: In case of an invalid savefile. 
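Since `save_model` and `load_model` now go through `h5dict`, their updated docstrings accept an `h5py.File` or `h5py.Group` in place of a file path. A sketch of both call styles, assuming `h5py` is installed; the model and file names are placeholders:

```python
import h5py
from keras.models import Sequential, save_model, load_model
from keras.layers import Dense

model = Sequential([Dense(2, input_shape=(4,))])
model.compile(optimizer='sgd', loss='mse')

# Plain file path: unchanged behaviour.
save_model(model, 'model.h5')
restored = load_model('model.h5')

# Per the updated docstrings, an open HDF5 file or group is also accepted,
# which allows storing the model next to other artifacts in one HDF5 file.
with h5py.File('experiment.h5', 'a') as f:
    save_model(model, f.create_group('keras_model'))
    restored = load_model(f['keras_model'])
```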
+ """ + if h5py is None: + raise ImportError('`load_model` requires h5py.') model = None + opened_new_file = not isinstance(filepath, h5py.Group) + f = h5dict(filepath, 'r') try: - # instantiate model - model_config = f.attrs.get('model_config') - if model_config is None: - raise ValueError('No model found in config file.') - model_config = json.loads(model_config.decode('utf-8')) - model = model_from_config(model_config, custom_objects=custom_objects) - - # set weights - load_weights_from_hdf5_group(f['model_weights'], model.layers) - - if compile: - # instantiate optimizer - training_config = f.attrs.get('training_config') - if training_config is None: - warnings.warn('No training configuration found in save file: ' - 'the model was *not* compiled. ' - 'Compile it manually.') - return model - training_config = json.loads(training_config.decode('utf-8')) - optimizer_config = training_config['optimizer_config'] - optimizer = optimizers.deserialize(optimizer_config, - custom_objects=custom_objects) - - # Recover loss functions and metrics. - loss = convert_custom_objects(training_config['loss']) - metrics = convert_custom_objects(training_config['metrics']) - sample_weight_mode = training_config['sample_weight_mode'] - loss_weights = training_config['loss_weights'] - - # Compile model. - model.compile(optimizer=optimizer, - loss=loss, - metrics=metrics, - loss_weights=loss_weights, - sample_weight_mode=sample_weight_mode) - - # Set optimizer weights. - if 'optimizer_weights' in f: - # Build train function (to get weight updates). - model._make_train_function() - optimizer_weights_group = f['optimizer_weights'] - optimizer_weight_names = [ - n.decode('utf8') for n in - optimizer_weights_group.attrs['weight_names']] - optimizer_weight_values = [optimizer_weights_group[n] for n in - optimizer_weight_names] - try: - model.optimizer.set_weights(optimizer_weight_values) - except ValueError: - warnings.warn('Error in loading the saved optimizer ' - 'state. As a result, your model is ' - 'starting with a freshly initialized ' - 'optimizer.') + model = _deserialize_model(f, custom_objects, compile) finally: if opened_new_file: f.close() return model +def pickle_model(model): + d = {} + f = h5dict(d) + _serialize_model(model, f) + return d + + +def unpickle_model(state): + f = h5dict(state, mode='r') + return _deserialize_model(f) + + def model_from_config(config, custom_objects=None): """Instantiates a Keras model from its config. @@ -545,7 +669,7 @@ def preprocess_weights_for_loading(layer, weights, original_keras_version=None, original_backend=None, reshape=False): - """Converts layers weights from Keras 1 format to Keras 2 and also weights of CuDNN layers in Keras 2. + """Converts layers weights from Keras 1 format to Keras 2. # Arguments layer: Layer instance. @@ -560,7 +684,7 @@ def preprocess_weights_for_loading(layer, weights, A list of weights values (Numpy arrays). """ def convert_nested_bidirectional(weights): - """Converts layers nested in `Bidirectional` wrapper by `preprocess_weights_for_loading()`. + """Converts layers nested in `Bidirectional` wrapper. # Arguments weights: List of weights values (Numpy arrays). @@ -568,18 +692,20 @@ def convert_nested_bidirectional(weights): A list of weights values (Numpy arrays). 
""" num_weights_per_layer = len(weights) // 2 - forward_weights = preprocess_weights_for_loading(layer.forward_layer, - weights[:num_weights_per_layer], - original_keras_version, - original_backend) - backward_weights = preprocess_weights_for_loading(layer.backward_layer, - weights[num_weights_per_layer:], - original_keras_version, - original_backend) + forward_weights = preprocess_weights_for_loading( + layer.forward_layer, + weights[:num_weights_per_layer], + original_keras_version, + original_backend) + backward_weights = preprocess_weights_for_loading( + layer.backward_layer, + weights[num_weights_per_layer:], + original_keras_version, + original_backend) return forward_weights + backward_weights def convert_nested_time_distributed(weights): - """Converts layers nested in `TimeDistributed` wrapper by `preprocess_weights_for_loading()`. + """Converts layers nested in `TimeDistributed` wrapper. # Arguments weights: List of weights values (Numpy arrays). @@ -590,7 +716,7 @@ def convert_nested_time_distributed(weights): layer.layer, weights, original_keras_version, original_backend) def convert_nested_model(weights): - """Converts layers nested in `Model` or `Sequential` by `preprocess_weights_for_loading()`. + """Converts layers nested in `Model` or `Sequential`. # Arguments weights: List of weights values (Numpy arrays). @@ -645,7 +771,8 @@ def convert_nested_model(weights): if shape[:2] != (layer.kernel_size[0], 1) or shape[3] != layer.filters: # Legacy shape: # (filters, input_dim, filter_length, 1) - assert shape[0] == layer.filters and shape[2:] == (layer.kernel_size[0], 1) + assert (shape[0] == layer.filters and + shape[2:] == (layer.kernel_size[0], 1)) weights[0] = np.transpose(weights[0], (2, 3, 1, 0)) weights[0] = weights[0][:, 0, :, :] @@ -862,14 +989,17 @@ def convert_weights(weights, from_cudnn=True): # convert the weights between CuDNNGRU and GRU(reset_after=True) if target_class in ['GRU', 'CuDNNGRU'] and len(weights) == 3: # We can determine the source of the weights from the shape of the bias. - # If there is no bias we skip the conversion since CuDNNGRU always has biases. + # If there is no bias we skip the conversion + # since CuDNNGRU always has biases. units = weights[1].shape[0] bias_shape = weights[2].shape n_gates = 3 def convert_weights(weights, from_cudnn=True): - kernels = transform_kernels(weights[0], transpose_input(from_cudnn), n_gates) + kernels = transform_kernels(weights[0], + transpose_input(from_cudnn), + n_gates) recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates) biases = np.array(weights[2]).reshape((2, -1) if from_cudnn else -1) return [kernels, recurrent_kernels, biases] @@ -908,7 +1038,7 @@ def _need_convert_kernel(original_backend): The convolution operation is implemented differently in different backends. While TH implements convolution, TF and CNTK implement the correlation operation. - So the channel axis needs to be flipped when we're loading TF weights onto a TH model, + So the channel axis needs to be flipped when TF weights are loaded on a TH model, or vice versa. However, there's no conversion required between TF and CNTK. 
# Arguments @@ -1062,9 +1192,10 @@ def load_weights_from_hdf5_group_by_name(f, layers, skip_mismatch=False, reshape=reshape) if len(weight_values) != len(symbolic_weights): if skip_mismatch: - warnings.warn('Skipping loading of weights for layer {}'.format(layer.name) + - ' due to mismatch in number of weights' + - ' ({} vs {}).'.format(len(symbolic_weights), len(weight_values))) + warnings.warn('Skipping loading of weights for ' + 'layer {}'.format(layer.name) + ' due to mismatch ' + 'in number of weights ({} vs {}).'.format( + len(symbolic_weights), len(weight_values))) continue else: raise ValueError('Layer #' + str(k) + @@ -1076,11 +1207,12 @@ def load_weights_from_hdf5_group_by_name(f, layers, skip_mismatch=False, ' element(s).') # Set values. for i in range(len(weight_values)): - if K.int_shape(symbolic_weights[i]) != weight_values[i].shape: + symbolic_shape = K.int_shape(symbolic_weights[i]) + if symbolic_shape != weight_values[i].shape: if skip_mismatch: - warnings.warn('Skipping loading of weights for layer {}'.format(layer.name) + - ' due to mismatch in shape' + - ' ({} vs {}).'.format( + warnings.warn('Skipping loading of weights for ' + 'layer {}'.format(layer.name) + ' due to ' + 'mismatch in shape ({} vs {}).'.format( symbolic_weights[i].shape, weight_values[i].shape)) continue @@ -1089,7 +1221,7 @@ def load_weights_from_hdf5_group_by_name(f, layers, skip_mismatch=False, ' (named "' + layer.name + '"), weight ' + str(symbolic_weights[i]) + - ' has shape {}'.format(K.int_shape(symbolic_weights[i])) + + ' has shape {}'.format(symbolic_shape) + ', but the saved weight has shape ' + str(weight_values[i].shape) + '.') else: diff --git a/keras/engine/sequential.py b/keras/engine/sequential.py index e75a68687be..bf85836f063 100644 --- a/keras/engine/sequential.py +++ b/keras/engine/sequential.py @@ -85,6 +85,7 @@ class Sequential(Model): def __init__(self, layers=None, name=None): super(Sequential, self).__init__(name=name) + self._build_input_shape = None # Add to the model any layers passed to the constructor. if layers: @@ -219,8 +220,7 @@ def build(self, input_shape=None): for layer in self._layers: x = layer(x) self.outputs = [x] - if self._layers: - self._layers[0].batch_input_shape = batch_shape + self._build_input_shape = input_shape if self.inputs: self._init_graph_network(self.inputs, @@ -271,21 +271,36 @@ def predict_classes(self, x, batch_size=32, verbose=0): return (proba > 0.5).astype('int32') def get_config(self): - config = [] + layer_configs = [] for layer in self.layers: - config.append({ + layer_configs.append({ 'class_name': layer.__class__.__name__, 'config': layer.get_config() }) - return copy.deepcopy(config) + config = { + 'name': self.name, + 'layers': copy.deepcopy(layer_configs) + } + if self._build_input_shape: + config['build_input_shape'] = self._build_input_shape + return config @classmethod def from_config(cls, config, custom_objects=None): - model = cls() - for conf in config: + if 'name' in config: + name = config['name'] + build_input_shape = config.get('build_input_shape') + layer_configs = config['layers'] + else: # legacy config file + name = build_input_shape = None + layer_configs = config + model = cls(name=name) + for conf in layer_configs: layer = layer_module.deserialize(conf, custom_objects=custom_objects) model.add(layer) + if not model.inputs and build_input_shape: + model.build(build_input_shape) return model # We need to overload the Keras Model class to handle MXNet model building activities. 
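The new `Sequential.get_config`/`from_config` format above records the model name and, when available, the input shape the model was built with, so a deferred-build model can be reconstructed from its config. A hedged sketch with a toy model:

```python
from keras.models import Sequential
from keras.layers import Dense

model = Sequential(name='mlp')
model.add(Dense(8, activation='relu'))
model.add(Dense(1))
model.build((None, 4))   # recorded as 'build_input_shape' in the config

config = model.get_config()
# config is now a dict of the form:
# {'name': 'mlp', 'layers': [...], 'build_input_shape': (None, 4)}

restored = Sequential.from_config(config)
print(restored.built)  # expected: True, rebuilt from 'build_input_shape'
```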
diff --git a/keras/engine/training.py b/keras/engine/training.py index feb082f80b1..9cdb5a06cf4 100644 --- a/keras/engine/training.py +++ b/keras/engine/training.py @@ -207,9 +207,18 @@ def compile(self, optimizer, for name in self.output_names: tmp_target_tensors.append(target_tensors.get(name, None)) target_tensors = tmp_target_tensors + elif K.is_tensor(target_tensors): + if len(self.outputs) != 1: + raise ValueError('The model has ' + str(len(self.outputs)) + + ' outputs, but you passed a single tensor as ' + '`target_tensors`. Expected a list or a dict ' + 'of tensors.') + target_tensors = [target_tensors] else: - raise TypeError('Expected `target_tensors` to be ' - 'a list or dict, but got:', target_tensors) + raise TypeError('Expected `target_tensors` to be a tensor, ' + 'a list of tensors, or dict of tensors, but got:', + target_tensors) + for i in range(len(self.outputs)): if i in skip_target_indices: self.targets.append(None) @@ -374,7 +383,8 @@ def handle_metrics(metrics, weights=None): metric_fn = metrics_module.binary_accuracy elif metric in ('crossentropy', 'ce'): metric_fn = metrics_module.binary_crossentropy - elif self.loss_functions[i] == losses.sparse_categorical_crossentropy: + elif (self.loss_functions[i] == + losses.sparse_categorical_crossentropy): # case: categorical accuracy/crossentropy # with sparse targets if metric in ('accuracy', 'acc'): @@ -569,7 +579,8 @@ def _set_inputs(self, inputs, outputs=None, training=None): when calling `fit`/etc. - if data tensors: the model is built on top of these tensors. We do not expect any Numpy data to be provided when calling `fit`/etc. - outputs: Optional output tensors (if already computed by running the model). + outputs: Optional output tensors (if already computed by running + the model). training: Boolean or None. Only relevant in symbolic mode. Specifies whether to build the model's graph in inference mode (False), training mode (True), or using the Keras learning phase (None). @@ -595,10 +606,7 @@ def _set_inputs(self, inputs, outputs=None, training=None): self._feed_inputs = [] self._feed_input_names = [] self._feed_input_shapes = [] - if isinstance(inputs, (list, tuple)): - inputs = list(inputs) - else: - inputs = [inputs] + inputs = to_list(inputs, allow_tuple=True) for i, v in enumerate(inputs): name = 'input_%d' % (i + 1) @@ -632,10 +640,7 @@ def _set_inputs(self, inputs, outputs=None, training=None): outputs = self.call(unpack_singleton(self.inputs), training=training) else: outputs = self.call(unpack_singleton(self.inputs)) - if isinstance(outputs, (list, tuple)): - outputs = list(outputs) - else: - outputs = [outputs] + outputs = to_list(outputs, allow_tuple=True) self.outputs = outputs self.output_names = [ 'output_%d' % (i + 1) for i in range(len(self.outputs))] @@ -703,10 +708,7 @@ def _standardize_user_data(self, x, 'You passed: y=' + str(y)) # Typecheck that all inputs are *either* value *or* symbolic. if y is not None: - if isinstance(y, (list, tuple)): - all_inputs += list(y) - else: - all_inputs.append(y) + all_inputs += to_list(y, allow_tuple=True) if any(K.is_tensor(v) for v in all_inputs): if not all(K.is_tensor(v) for v in all_inputs): raise ValueError('Do not pass inputs that mix Numpy ' @@ -715,8 +717,7 @@ def _standardize_user_data(self, x, '; y=' + str(y)) # Handle target tensors if any passed. 
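The `compile` change above lets a single-output model receive a bare tensor for `target_tensors` (previously only a list or dict was accepted). A sketch where the target is symbolic; the placeholder here stands in for any backend tensor, e.g. one coming from an input pipeline:

```python
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([Dense(1, input_shape=(4,))])

# Symbolic target; purely illustrative.
target = K.placeholder(shape=(None, 1))

# A single tensor is now accepted for single-output models;
# multi-output models must still pass a list or dict.
model.compile(optimizer='sgd', loss='mse', target_tensors=target)
```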
- if not isinstance(y, (list, tuple)): - y = [y] + y = to_list(y, allow_tuple=True) target_tensors = [v for v in y if K.is_tensor(v)] if not target_tensors: target_tensors = None @@ -985,9 +986,9 @@ def fit(self, sample_weight=val_sample_weight, batch_size=batch_size) if self._uses_dynamic_learning_phase(): - val_ins = val_x + val_y + val_sample_weights + [0.] + val_inputs = val_x + val_y + val_sample_weights + [0.] else: - val_ins = val_x + val_y + val_sample_weights + val_inputs = val_x + val_y + val_sample_weights elif validation_split and 0. < validation_split < 1.: if any(K.is_tensor(t) for t in x): @@ -1007,45 +1008,45 @@ def fit(self, slice_arrays(sample_weights, 0, split_at), slice_arrays(sample_weights, split_at)) if self._uses_dynamic_learning_phase(): - val_ins = val_x + val_y + val_sample_weights + [0.] + val_inputs = val_x + val_y + val_sample_weights + [0.] else: - val_ins = val_x + val_y + val_sample_weights + val_inputs = val_x + val_y + val_sample_weights elif validation_steps: do_validation = True if self._uses_dynamic_learning_phase(): - val_ins = [0.] + val_inputs = [0.] # Prepare input arrays and training function. if self._uses_dynamic_learning_phase(): - ins = x + y + sample_weights + [1.] + fit_inputs = x + y + sample_weights + [1.] else: - ins = x + y + sample_weights + fit_inputs = x + y + sample_weights self._make_train_function() - f = self.train_function + fit_function = self.train_function # Prepare display labels. out_labels = self.metrics_names if do_validation: self._make_test_function() - val_f = self.test_function + val_function = self.test_function callback_metrics = copy.copy(out_labels) + [ 'val_' + n for n in out_labels] else: callback_metrics = copy.copy(out_labels) - val_f = None - val_ins = [] + val_function = None + val_inputs = [] # Delegate logic to `fit_loop`. - return training_arrays.fit_loop(self, f, ins, + return training_arrays.fit_loop(self, fit_function, fit_inputs, out_labels=out_labels, batch_size=batch_size, epochs=epochs, verbose=verbose, callbacks=callbacks, - val_f=val_f, - val_ins=val_ins, + val_function=val_function, + val_inputs=val_inputs, shuffle=shuffle, callback_metrics=callback_metrics, initial_epoch=initial_epoch, @@ -1302,7 +1303,8 @@ def fit_generator(self, generator, use_multiprocessing=False, shuffle=True, initial_epoch=0): - """Trains the model on data generated batch-by-batch by a Python generator (or an instance of `Sequence`). + """Trains the model on data generated batch-by-batch by a Python generator + (or an instance of `Sequence`). The generator is run in parallel to the model, for efficiency. For instance, this allows you to do real-time data augmentation diff --git a/keras/engine/training_arrays.py b/keras/engine/training_arrays.py index e74096261b9..80fcdeaff99 100644 --- a/keras/engine/training_arrays.py +++ b/keras/engine/training_arrays.py @@ -18,40 +18,41 @@ from ..utils.generic_utils import unpack_singleton -def fit_loop(model, f, ins, +def fit_loop(model, fit_function, fit_inputs, out_labels=None, batch_size=None, epochs=100, verbose=1, callbacks=None, - val_f=None, - val_ins=None, + val_function=None, + val_inputs=None, shuffle=True, callback_metrics=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None): - """Abstract fit function for `f(ins)`. + """Abstract fit function for `fit_function(fit_inputs)`. - Assumes that f returns a list, labeled by out_labels. + Assumes that fit_function returns a list, labeled by out_labels. # Arguments model: Keras model instance. 
- f: Keras function returning a list of tensors - ins: List of tensors to be fed to `f` + fit_function: Keras function returning a list of tensors + fit_inputs: List of tensors to be fed to `fit_function` out_labels: List of strings, display names of - the outputs of `f` + the outputs of `fit_function` batch_size: Integer batch size or None if unknown. epochs: Number of times to iterate over the data verbose: Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training - val_f: Keras function to call for validation - val_ins: List of tensors to be fed to `val_f` + val_function: Keras function to call for validation + val_inputs: List of tensors to be fed to `val_function` shuffle: Whether to shuffle the data at the beginning of each epoch callback_metrics: List of strings, the display names of the metrics passed to the callbacks. They should be the concatenation of list the display names of the outputs of - `f` and the list of display names of the outputs of `f_val`. + `fit_function` and the list of display names + of the outputs of `fit_inputs`. initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) @@ -65,12 +66,12 @@ def fit_loop(model, f, ins, `History` object. """ do_validation = False - if val_f and val_ins: + if val_function and val_inputs: do_validation = True - if (verbose and ins and - hasattr(ins[0], 'shape') and hasattr(val_ins[0], 'shape')): + if (verbose and fit_inputs and + hasattr(fit_inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')): print('Train on %d samples, validate on %d samples' % - (ins[0].shape[0], val_ins[0].shape[0])) + (fit_inputs[0].shape[0], val_inputs[0].shape[0])) if validation_steps: do_validation = True if steps_per_epoch is None: @@ -84,7 +85,7 @@ def fit_loop(model, f, ins, 'to perform validation ' 'when doing step-wise training.') - num_train_samples = check_num_samples(ins, + num_train_samples = check_num_samples(fit_inputs, batch_size=batch_size, steps=steps_per_epoch, steps_name='steps_per_epoch') @@ -127,7 +128,7 @@ def fit_loop(model, f, ins, callbacks.on_train_begin() callback_model.stop_training = False for cbk in callbacks: - cbk.validation_data = val_ins + cbk.validation_data = val_inputs # To prevent a slowdown, # we find beforehand the arrays that need conversion. @@ -136,7 +137,7 @@ def fit_loop(model, f, ins, model._feed_sample_weights) indices_for_conversion_to_dense = [] for i in range(len(feed)): - if issparse(ins[i]) and not K.is_sparse(feed[i]): + if issparse(fit_inputs[i]) and not K.is_sparse(feed[i]): indices_for_conversion_to_dense.append(i) for epoch in range(initial_epoch, epochs): @@ -151,7 +152,7 @@ def fit_loop(model, f, ins, batch_logs['batch'] = step_index batch_logs['size'] = 1 callbacks.on_batch_begin(step_index, batch_logs) - outs = f(ins) + outs = fit_function(fit_inputs) outs = to_list(outs) for l, o in zip(out_labels, outs): @@ -162,7 +163,7 @@ def fit_loop(model, f, ins, break if do_validation: - val_outs = test_loop(model, val_f, val_ins, + val_outs = test_loop(model, val_function, val_inputs, steps=validation_steps, verbose=0) val_outs = to_list(val_outs) @@ -179,12 +180,12 @@ def fit_loop(model, f, ins, for batch_index, (batch_start, batch_end) in enumerate(batches): batch_ids = index_array[batch_start:batch_end] try: - if isinstance(ins[-1], float): + if isinstance(fit_inputs[-1], float): # Do not slice the training phase flag. 
ins_batch = slice_arrays( - ins[:-1], batch_ids) + [ins[-1]] + fit_inputs[:-1], batch_ids) + [fit_inputs[-1]] else: - ins_batch = slice_arrays(ins, batch_ids) + ins_batch = slice_arrays(fit_inputs, batch_ids) except TypeError: raise TypeError('TypeError while preparing batch. ' 'If using HDF5 input data, ' @@ -196,7 +197,7 @@ def fit_loop(model, f, ins, for i in indices_for_conversion_to_dense: ins_batch[i] = ins_batch[i].toarray() - outs = f(ins_batch) + outs = fit_function(ins_batch) outs = to_list(outs) for l, o in zip(out_labels, outs): batch_logs[l] = o @@ -207,7 +208,7 @@ def fit_loop(model, f, ins, if batch_index == len(batches) - 1: # Last batch. if do_validation: - val_outs = test_loop(model, val_f, val_ins, + val_outs = test_loop(model, val_function, val_inputs, batch_size=batch_size, verbose=0) val_outs = to_list(val_outs) diff --git a/keras/engine/training_generator.py b/keras/engine/training_generator.py index 535f34e0909..3c480f226e3 100644 --- a/keras/engine/training_generator.py +++ b/keras/engine/training_generator.py @@ -7,6 +7,7 @@ import warnings import numpy as np +from .training_utils import iter_sequence_infinite from .. import backend as K from ..utils.data_utils import Sequence from ..utils.data_utils import GeneratorEnqueuer @@ -109,19 +110,22 @@ def fit_generator(model, # Create an Enqueuer that can be reused val_data = validation_data if isinstance(val_data, Sequence): - val_enqueuer = OrderedEnqueuer(val_data, - use_multiprocessing=use_multiprocessing) - validation_steps = len(val_data) + val_enqueuer = OrderedEnqueuer( + val_data, + use_multiprocessing=use_multiprocessing) + validation_steps = validation_steps or len(val_data) else: - val_enqueuer = GeneratorEnqueuer(val_data, - use_multiprocessing=use_multiprocessing) + val_enqueuer = GeneratorEnqueuer( + val_data, + use_multiprocessing=use_multiprocessing) val_enqueuer.start(workers=workers, max_queue_size=max_queue_size) val_enqueuer_gen = val_enqueuer.get() elif val_gen: val_data = validation_data if isinstance(val_data, Sequence): - val_enqueuer_gen = iter(val_data) + val_enqueuer_gen = iter_sequence_infinite(val_data) + validation_steps = validation_steps or len(val_data) else: val_enqueuer_gen = val_data else: @@ -160,7 +164,7 @@ def fit_generator(model, output_generator = enqueuer.get() else: if is_sequence: - output_generator = iter(generator) + output_generator = iter_sequence_infinite(generator) else: output_generator = generator @@ -315,7 +319,7 @@ def evaluate_generator(model, generator, output_generator = enqueuer.get() else: if is_sequence: - output_generator = iter(generator) + output_generator = iter_sequence_infinite(generator) else: output_generator = generator @@ -420,7 +424,7 @@ def predict_generator(model, generator, output_generator = enqueuer.get() else: if is_sequence: - output_generator = iter(generator) + output_generator = iter_sequence_infinite(generator) else: output_generator = generator diff --git a/keras/engine/training_utils.py b/keras/engine/training_utils.py index 1aba3387a9e..a1253b4872a 100644 --- a/keras/engine/training_utils.py +++ b/keras/engine/training_utils.py @@ -6,6 +6,7 @@ import copy import numpy as np +import warnings from .. import backend as K from .. 
import losses @@ -484,6 +485,10 @@ def standardize_weights(y, 'sample-wise weights, make sure your ' 'sample_weight array is 1D.') + if sample_weight is not None and class_weight is not None: + warnings.warn('Found both `sample_weight` and `class_weight`: ' + '`class_weight` argument will be ignored.') + if sample_weight is not None: if len(sample_weight.shape) > len(y.shape): raise ValueError('Found a sample_weight with shape' + @@ -578,3 +583,17 @@ def check_num_samples(ins, if hasattr(ins[0], 'shape'): return int(ins[0].shape[0]) return None # Edge case where ins == [static_learning_phase] + + +def iter_sequence_infinite(seq): + """Iterate indefinitely over a Sequence. + + # Arguments + seq: Sequence object + + # Returns + Generator yielding batches. + """ + while True: + for item in seq: + yield item diff --git a/keras/initializers.py b/keras/initializers.py index 18fb74a8fb0..ca984724df6 100644 --- a/keras/initializers.py +++ b/keras/initializers.py @@ -266,7 +266,9 @@ def get_config(self): class Identity(Initializer): """Initializer that generates the identity matrix. - Only use for square 2D matrices. + Only use for 2D matrices. + If the long side of the matrix is a multiple of the short side, + multiple identity matrices are concatenated along the long side. # Arguments gain: Multiplicative factor to apply to the identity matrix. @@ -276,11 +278,21 @@ def __init__(self, gain=1.): self.gain = gain def __call__(self, shape, dtype=None): - if len(shape) != 2 or shape[0] != shape[1]: - raise ValueError('Identity matrix initializer can only be used ' - 'for 2D square matrices.') - else: + if len(shape) != 2: + raise ValueError( + 'Identity matrix initializer can only be used for 2D matrices.') + + if max(shape) % min(shape) != 0: + raise ValueError('Long side should be multiple of short side.') + + if shape[0] == shape[1]: return self.gain * np.identity(shape[0]) + elif shape[0] > shape[1]: + return self.gain * np.concatenate( + [np.identity(shape[1])] * (shape[0] // shape[1]), axis=0) + else: + return self.gain * np.concatenate( + [np.identity(shape[0])] * (shape[1] // shape[0]), axis=1) def get_config(self): return { diff --git a/keras/layers/advanced_activations.py b/keras/layers/advanced_activations.py index 76394440395..ce1d2998413 100644 --- a/keras/layers/advanced_activations.py +++ b/keras/layers/advanced_activations.py @@ -13,6 +13,7 @@ from ..engine.base_layer import InputSpec from .. import backend as K from ..legacy import interfaces +from ..utils.generic_utils import to_list class LeakyReLU(Layer): @@ -34,7 +35,8 @@ class LeakyReLU(Layer): alpha: float >= 0. Negative slope coefficient. # References - - [Rectifier Nonlinearities Improve Neural Network Acoustic Models](https://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf) + - [Rectifier Nonlinearities Improve Neural Network Acoustic Models] + (https://ai.stanford.edu/~amaas/papers/relu_hybrid_icml2013_final.pdf) """ def __init__(self, alpha=0.3, **kwargs): @@ -84,7 +86,8 @@ class PReLU(Layer): set `shared_axes=[1, 2]`. 
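A short sketch of the `shared_axes` usage described above, sharing the learned slope across the spatial dimensions of a `channels_last` feature map; the layer sizes are illustrative only:

```python
from keras.models import Sequential
from keras.layers import Conv2D, PReLU

# One alpha per channel, shared across height and width (axes 1 and 2).
model = Sequential([
    Conv2D(32, (3, 3), input_shape=(28, 28, 3)),
    PReLU(shared_axes=[1, 2]),
])
```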
# References - - [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852) + - [Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification](https://arxiv.org/abs/1502.01852) """ @interfaces.legacy_prelu_support @@ -100,10 +103,8 @@ def __init__(self, alpha_initializer='zeros', self.alpha_constraint = constraints.get(alpha_constraint) if shared_axes is None: self.shared_axes = None - elif not isinstance(shared_axes, (list, tuple)): - self.shared_axes = [shared_axes] else: - self.shared_axes = list(shared_axes) + self.shared_axes = to_list(shared_axes, allow_tuple=True) def build(self, input_shape): param_shape = list(input_shape[1:]) @@ -168,7 +169,8 @@ class ELU(Layer): alpha: scale for the negative factor. # References - - [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)](https://arxiv.org/abs/1511.07289v1) + - [Fast and Accurate Deep Network Learning by Exponential Linear Units + (ELUs)](https://arxiv.org/abs/1511.07289v1) """ def __init__(self, alpha=1.0, **kwargs): @@ -207,7 +209,8 @@ class ThresholdedReLU(Layer): theta: float >= 0. Threshold location of activation. # References - - [Zero-Bias Autoencoders and the Benefits of Co-Adapting Features](http://arxiv.org/abs/1402.3337) + - [Zero-Bias Autoencoders and the Benefits of Co-Adapting Features] + (https://arxiv.org/abs/1402.3337) """ def __init__(self, theta=1.0, **kwargs): @@ -262,6 +265,13 @@ def compute_output_shape(self, input_shape): class ReLU(Layer): """Rectified Linear Unit activation function. + With default values, it returns element-wise `max(x, 0)`. + + Otherwise, it follows: + `f(x) = max_value` for `x >= max_value`, + `f(x) = x` for `threshold <= x < max_value`, + `f(x) = negative_slope * (x - threshold)` otherwise. + # Input shape Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) @@ -271,19 +281,39 @@ class ReLU(Layer): Same shape as the input. # Arguments - max_value: Float, the maximum output value. + max_value: float >= 0. Maximum activation value. + negative_slope: float >= 0. Negative slope coefficient. + threshold: float. Threshold value for thresholded activation. 
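A sketch of the extended `ReLU` layer arguments described above; the parameter values are chosen only to make the piecewise behaviour visible, and the printed output follows from the formula in the docstring:

```python
import numpy as np
from keras.models import Sequential
from keras.layers import ReLU

# f(x) = max_value                           for x >= max_value
# f(x) = x                                   for threshold <= x < max_value
# f(x) = negative_slope * (x - threshold)    otherwise
model = Sequential([ReLU(max_value=6.0, negative_slope=0.1, threshold=1.0,
                         input_shape=(5,))])

x = np.array([[-2.0, 0.5, 1.0, 3.0, 10.0]])
print(model.predict(x))  # approximately [[-0.3, -0.05, 1.0, 3.0, 6.0]]
```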
""" - def __init__(self, max_value=None, **kwargs): + def __init__(self, max_value=None, negative_slope=0., + threshold=0., **kwargs): super(ReLU, self).__init__(**kwargs) + if max_value is not None and max_value < 0.: + raise ValueError('max_value of ReLU layer ' + 'cannot be negative value: %s' % str(max_value)) + if negative_slope < 0.: + raise ValueError('negative_slope of ReLU layer cannot be ' + 'negative value: %s' % str(negative_slope)) self.supports_masking = True + if max_value is not None: + max_value = K.cast_to_floatx(max_value) self.max_value = max_value + self.negative_slope = K.cast_to_floatx(negative_slope) + self.threshold = K.cast_to_floatx(threshold) def call(self, inputs): - return activations.relu(inputs, max_value=self.max_value) + return K.relu(inputs, + alpha=self.negative_slope, + max_value=self.max_value, + threshold=self.threshold) def get_config(self): - config = {'max_value': self.max_value} + config = { + 'max_value': self.max_value, + 'negative_slope': self.negative_slope, + 'threshold': self.threshold + } base_config = super(ReLU, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/keras/layers/convolutional.py b/keras/layers/convolutional.py index 7db563f590b..0cba3cf849a 100644 --- a/keras/layers/convolutional.py +++ b/keras/layers/convolutional.py @@ -105,11 +105,13 @@ def __init__(self, rank, super(_Conv, self).__init__(**kwargs) self.rank = rank self.filters = filters - self.kernel_size = conv_utils.normalize_tuple(kernel_size, rank, 'kernel_size') + self.kernel_size = conv_utils.normalize_tuple(kernel_size, rank, + 'kernel_size') self.strides = conv_utils.normalize_tuple(strides, rank, 'strides') self.padding = conv_utils.normalize_padding(padding) self.data_format = K.normalize_data_format(data_format) - self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, rank, 'dilation_rate') + self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, rank, + 'dilation_rate') self.activation = activations.get(activation) self.use_bias = use_bias self.kernel_initializer = initializers.get(kernel_initializer) @@ -231,7 +233,8 @@ def get_config(self): 'bias_initializer': initializers.serialize(self.bias_initializer), 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint) } @@ -268,10 +271,14 @@ class Conv1D(_Conv): `"valid"` means "no padding". `"same"` results in padding the input such that the output has the same length as the original input. - `"causal"` results in causal (dilated) convolutions, e.g. output[t] - does not depend on input[t+1:]. Useful when modeling temporal data - where the model should not violate the temporal order. - See [WaveNet: A Generative Model for Raw Audio, section 2.1](https://arxiv.org/abs/1609.03499). + `"causal"` results in causal (dilated) convolutions, + e.g. `output[t]` does not depend on `input[t + 1:]`. + A zero padding is used such that + the output has the same length as the original input. + Useful when modeling temporal data where the model + should not violate the temporal order. See + [WaveNet: A Generative Model for Raw Audio, section 2.1] + (https://arxiv.org/abs/1609.03499). 
data_format: A string, one of `"channels_last"` (default) or `"channels_first"`. The ordering of the dimensions in the inputs. @@ -578,7 +585,8 @@ class Conv3D(_Conv): or 5D tensor with shape: `(batch, new_conv_dim1, new_conv_dim2, new_conv_dim3, filters)` if `data_format` is `"channels_last"`. - `new_conv_dim1`, `new_conv_dim2` and `new_conv_dim3` values might have changed due to padding. + `new_conv_dim1`, `new_conv_dim2` and `new_conv_dim3` values might have + changed due to padding. """ @interfaces.legacy_conv3d_support @@ -719,13 +727,17 @@ class Conv2DTranspose(Conv2D): If `output_padding` is specified: ``` - new_rows = (rows - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + output_padding[0] - new_cols = (cols - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + output_padding[1] + new_rows = ((rows - 1) * strides[0] + kernel_size[0] + - 2 * padding[0] + output_padding[0]) + new_cols = ((cols - 1) * strides[1] + kernel_size[1] + - 2 * padding[1] + output_padding[1]) ``` # References - - [A guide to convolution arithmetic for deep learning](https://arxiv.org/abs/1603.07285v1) - - [Deconvolutional Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf) + - [A guide to convolution arithmetic for deep learning] + (https://arxiv.org/abs/1603.07285v1) + - [Deconvolutional Networks] + (http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf) """ @interfaces.legacy_deconv2d_support @@ -735,6 +747,7 @@ def __init__(self, filters, padding='valid', output_padding=None, data_format=None, + dilation_rate=(1, 1), activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -751,6 +764,7 @@ def __init__(self, filters, strides=strides, padding=padding, data_format=data_format, + dilation_rate=dilation_rate, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, @@ -828,11 +842,13 @@ def call(self, inputs): out_height = conv_utils.deconv_length(height, stride_h, kernel_h, self.padding, - out_pad_h) + out_pad_h, + self.dilation_rate[0]) out_width = conv_utils.deconv_length(width, stride_w, kernel_w, self.padding, - out_pad_w) + out_pad_w, + self.dilation_rate[1]) if self.data_format == 'channels_first': output_shape = (batch_size, self.filters, out_height, out_width) else: @@ -844,7 +860,8 @@ def call(self, inputs): output_shape, self.strides, padding=self.padding, - data_format=self.data_format) + data_format=self.data_format, + dilation_rate=self.dilation_rate) if self.use_bias: outputs = K.bias_add( @@ -875,17 +892,18 @@ def compute_output_shape(self, input_shape): stride_h, kernel_h, self.padding, - out_pad_h) + out_pad_h, + self.dilation_rate[0]) output_shape[w_axis] = conv_utils.deconv_length(output_shape[w_axis], stride_w, kernel_w, self.padding, - out_pad_w) + out_pad_w, + self.dilation_rate[1]) return tuple(output_shape) def get_config(self): config = super(Conv2DTranspose, self).get_config() - config.pop('dilation_rate') config['output_padding'] = self.output_padding return config @@ -986,14 +1004,19 @@ class Conv3DTranspose(Conv3D): If `output_padding` is specified:: ``` - new_depth = (depth - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + output_padding[0] - new_rows = (rows - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + output_padding[1] - new_cols = (cols - 1) * strides[2] + kernel_size[2] - 2 * padding[2] + output_padding[2] + new_depth = ((depth - 1) * strides[0] + kernel_size[0] + - 2 * padding[0] + output_padding[0]) + new_rows = ((rows - 1) * strides[1] + kernel_size[1] + - 2 * padding[1] + output_padding[1]) + 
new_cols = ((cols - 1) * strides[2] + kernel_size[2] + - 2 * padding[2] + output_padding[2]) ``` # References - - [A guide to convolution arithmetic for deep learning](https://arxiv.org/abs/1603.07285v1) - - [Deconvolutional Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf) + - [A guide to convolution arithmetic for deep learning] + (https://arxiv.org/abs/1603.07285v1) + - [Deconvolutional Networks] + (http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf) """ def __init__(self, filters, @@ -1108,9 +1131,11 @@ def call(self, inputs): out_pad_w) if self.data_format == 'channels_first': - output_shape = (batch_size, self.filters, out_depth, out_height, out_width) + output_shape = (batch_size, self.filters, + out_depth, out_height, out_width) else: - output_shape = (batch_size, out_depth, out_height, out_width, self.filters) + output_shape = (batch_size, out_depth, + out_height, out_width, self.filters) outputs = K.conv3d_transpose(inputs, self.kernel, @@ -1393,12 +1418,18 @@ def get_config(self): config.pop('kernel_regularizer') config.pop('kernel_constraint') config['depth_multiplier'] = self.depth_multiplier - config['depthwise_initializer'] = initializers.serialize(self.depthwise_initializer) - config['pointwise_initializer'] = initializers.serialize(self.pointwise_initializer) - config['depthwise_regularizer'] = regularizers.serialize(self.depthwise_regularizer) - config['pointwise_regularizer'] = regularizers.serialize(self.pointwise_regularizer) - config['depthwise_constraint'] = constraints.serialize(self.depthwise_constraint) - config['pointwise_constraint'] = constraints.serialize(self.pointwise_constraint) + config['depthwise_initializer'] = ( + initializers.serialize(self.depthwise_initializer)) + config['pointwise_initializer'] = ( + initializers.serialize(self.pointwise_initializer)) + config['depthwise_regularizer'] = ( + regularizers.serialize(self.depthwise_regularizer)) + config['pointwise_regularizer'] = ( + regularizers.serialize(self.pointwise_regularizer)) + config['depthwise_constraint'] = ( + constraints.serialize(self.depthwise_constraint)) + config['pointwise_constraint'] = ( + constraints.serialize(self.pointwise_constraint)) return config @@ -1433,9 +1464,6 @@ class SeparableConv1D(_SeparableConv): `(batch, steps, channels)` while `"channels_first"` corresponds to inputs with shape `(batch, channels, steps)`. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "channels_last". dilation_rate: An integer or tuple/list of a single integer, specifying the dilation rate to use for dilated convolution. 
Currently, specifying any `dilation_rate` value != 1 is @@ -1497,7 +1525,7 @@ def __init__(self, filters, kernel_size, strides=1, padding='valid', - data_format=None, + data_format='channels_last', dilation_rate=1, depth_multiplier=1, activation=None, @@ -1879,13 +1907,61 @@ def get_config(self): config.pop('kernel_regularizer') config.pop('kernel_constraint') config['depth_multiplier'] = self.depth_multiplier - config['depthwise_initializer'] = initializers.serialize(self.depthwise_initializer) - config['depthwise_regularizer'] = regularizers.serialize(self.depthwise_regularizer) - config['depthwise_constraint'] = constraints.serialize(self.depthwise_constraint) + config['depthwise_initializer'] = ( + initializers.serialize(self.depthwise_initializer)) + config['depthwise_regularizer'] = ( + regularizers.serialize(self.depthwise_regularizer)) + config['depthwise_constraint'] = ( + constraints.serialize(self.depthwise_constraint)) return config -class UpSampling1D(Layer): +class _UpSampling(Layer): + """Abstract nD UpSampling layer (private, used as implementation base). + + # Arguments + size: Tuple of ints. + data_format: A string, + one of `"channels_last"` or `"channels_first"`. + The ordering of the dimensions in the inputs. + `"channels_last"` corresponds to inputs with shape + `(batch, ..., channels)` while `"channels_first"` corresponds to + inputs with shape `(batch, channels, ...)`. + It defaults to the `image_data_format` value found in your + Keras config file at `~/.keras/keras.json`. + If you never set it, then it will be "channels_last". + """ + def __init__(self, size, data_format=None, **kwargs): + # self.rank is 1 for UpSampling1D, 2 for UpSampling2D. + self.rank = len(size) + self.size = size + self.data_format = K.normalize_data_format(data_format) + self.input_spec = InputSpec(ndim=self.rank + 2) + super(_UpSampling, self).__init__(**kwargs) + + def call(self, inputs): + raise NotImplementedError + + def compute_output_shape(self, input_shape): + size_all_dims = (1,) + self.size + (1,) + spatial_axes = list(range(1, 1 + self.rank)) + size_all_dims = transpose_shape(size_all_dims, + self.data_format, + spatial_axes) + output_shape = list(input_shape) + for dim in range(len(output_shape)): + if output_shape[dim] is not None: + output_shape[dim] *= size_all_dims[dim] + return tuple(output_shape) + + def get_config(self): + config = {'size': self.size, + 'data_format': self.data_format} + base_config = super(_UpSampling, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class UpSampling1D(_UpSampling): """Upsampling layer for 1D inputs. Repeats each temporal step `size` times along the time axis. 
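A simplified, framework-free sketch of the shape rule that the new `_UpSampling.compute_output_shape` encodes (the real implementation routes the factors through `transpose_shape`); the helper name and shapes below are illustrative:

```python
def upsampling_output_shape(input_shape, size, data_format='channels_last'):
    # Each spatial dimension is multiplied by its upsampling factor;
    # batch and channel dimensions are left untouched.
    shape = list(input_shape)
    offset = 2 if data_format == 'channels_first' else 1
    for i, factor in enumerate(size):
        if shape[offset + i] is not None:
            shape[offset + i] *= factor
    return tuple(shape)

print(upsampling_output_shape((None, 10, 10, 3), (2, 2)))
# (None, 20, 20, 3)
print(upsampling_output_shape((None, 3, 10, 10), (2, 2), 'channels_first'))
# (None, 3, 20, 20)
```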
@@ -1902,25 +1978,20 @@ class UpSampling1D(Layer): @interfaces.legacy_upsampling1d_support def __init__(self, size=2, **kwargs): - super(UpSampling1D, self).__init__(**kwargs) - self.size = int(size) - self.input_spec = InputSpec(ndim=3) - - def compute_output_shape(self, input_shape): - size = self.size * input_shape[1] if input_shape[1] is not None else None - return (input_shape[0], size, input_shape[2]) + super(UpSampling1D, self).__init__((int(size),), 'channels_last', **kwargs) def call(self, inputs): - output = K.repeat_elements(inputs, self.size, axis=1) + output = K.repeat_elements(inputs, self.size[0], axis=1) return output def get_config(self): - config = {'size': self.size} - base_config = super(UpSampling1D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) + config = super(UpSampling1D, self).get_config() + config['size'] = self.size[0] + config.pop('data_format') + return config -class UpSampling2D(Layer): +class UpSampling2D(_UpSampling): """Upsampling layer for 2D inputs. Repeats the rows and columns of the data @@ -1939,6 +2010,9 @@ class UpSampling2D(Layer): It defaults to the `image_data_format` value found in your Keras config file at `~/.keras/keras.json`. If you never set it, then it will be "channels_last". + interpolation: A string, one of `nearest` or `bilinear`. + Note that CNTK does not support yet the `bilinear` upscaling + and that with Theano, only `size=(2, 2)` is possible. # Input shape 4D tensor with shape: @@ -1956,40 +2030,26 @@ class UpSampling2D(Layer): """ @interfaces.legacy_upsampling2d_support - def __init__(self, size=(2, 2), data_format=None, **kwargs): - super(UpSampling2D, self).__init__(**kwargs) - self.data_format = K.normalize_data_format(data_format) - self.size = conv_utils.normalize_tuple(size, 2, 'size') - self.input_spec = InputSpec(ndim=4) - - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - height = self.size[0] * input_shape[2] if input_shape[2] is not None else None - width = self.size[1] * input_shape[3] if input_shape[3] is not None else None - return (input_shape[0], - input_shape[1], - height, - width) - elif self.data_format == 'channels_last': - height = self.size[0] * input_shape[1] if input_shape[1] is not None else None - width = self.size[1] * input_shape[2] if input_shape[2] is not None else None - return (input_shape[0], - height, - width, - input_shape[3]) + def __init__(self, size=(2, 2), data_format=None, interpolation='nearest', + **kwargs): + normalized_size = conv_utils.normalize_tuple(size, 2, 'size') + super(UpSampling2D, self).__init__(normalized_size, data_format, **kwargs) + if interpolation not in ['nearest', 'bilinear']: + raise ValueError('interpolation should be one ' + 'of "nearest" or "bilinear".') + self.interpolation = interpolation def call(self, inputs): return K.resize_images(inputs, self.size[0], self.size[1], - self.data_format) + self.data_format, self.interpolation) def get_config(self): - config = {'size': self.size, - 'data_format': self.data_format} - base_config = super(UpSampling2D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) + config = super(UpSampling2D, self).get_config() + config['interpolation'] = self.interpolation + return config -class UpSampling3D(Layer): +class UpSampling3D(_UpSampling): """Upsampling layer for 3D inputs. 
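A minimal usage sketch of the new `interpolation` argument, assuming the TensorFlow backend (per the CNTK/Theano caveats above); the input shape is illustrative:

```python
from keras.models import Sequential
from keras.layers import UpSampling2D

# Doubles the spatial resolution of 32x32 RGB feature maps; 'bilinear'
# interpolates between pixels instead of repeating them ('nearest').
model = Sequential()
model.add(UpSampling2D(size=(2, 2), interpolation='bilinear',
                       input_shape=(32, 32, 3)))
model.summary()  # output shape: (None, 64, 64, 3)
```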
Repeats the 1st, 2nd and 3rd dimensions @@ -2026,44 +2086,62 @@ class UpSampling3D(Layer): @interfaces.legacy_upsampling3d_support def __init__(self, size=(2, 2, 2), data_format=None, **kwargs): - self.data_format = K.normalize_data_format(data_format) - self.size = conv_utils.normalize_tuple(size, 3, 'size') - self.input_spec = InputSpec(ndim=5) - super(UpSampling3D, self).__init__(**kwargs) - - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - dim1 = self.size[0] * input_shape[2] if input_shape[2] is not None else None - dim2 = self.size[1] * input_shape[3] if input_shape[3] is not None else None - dim3 = self.size[2] * input_shape[4] if input_shape[4] is not None else None - return (input_shape[0], - input_shape[1], - dim1, - dim2, - dim3) - elif self.data_format == 'channels_last': - dim1 = self.size[0] * input_shape[1] if input_shape[1] is not None else None - dim2 = self.size[1] * input_shape[2] if input_shape[2] is not None else None - dim3 = self.size[2] * input_shape[3] if input_shape[3] is not None else None - return (input_shape[0], - dim1, - dim2, - dim3, - input_shape[4]) + normalized_size = conv_utils.normalize_tuple(size, 3, 'size') + super(UpSampling3D, self).__init__(normalized_size, data_format, **kwargs) def call(self, inputs): return K.resize_volumes(inputs, self.size[0], self.size[1], self.size[2], self.data_format) + +class _ZeroPadding(Layer): + """Abstract nD ZeroPadding layer (private, used as implementation base). + + # Arguments + padding: Tuple of tuples of two ints. Can be a tuple of ints when + rank is 1. + data_format: A string, + one of `"channels_last"` or `"channels_first"`. + The ordering of the dimensions in the inputs. + `"channels_last"` corresponds to inputs with shape + `(batch, ..., channels)` while `"channels_first"` corresponds to + inputs with shape `(batch, channels, ...)`. + It defaults to the `image_data_format` value found in your + Keras config file at `~/.keras/keras.json`. + If you never set it, then it will be "channels_last". + """ + def __init__(self, padding, data_format=None, **kwargs): + # self.rank is 1 for ZeroPadding1D, 2 for ZeroPadding2D. + self.rank = len(padding) + self.padding = padding + self.data_format = K.normalize_data_format(data_format) + self.input_spec = InputSpec(ndim=self.rank + 2) + super(_ZeroPadding, self).__init__(**kwargs) + + def call(self, inputs): + raise NotImplementedError + + def compute_output_shape(self, input_shape): + padding_all_dims = ((0, 0),) + self.padding + ((0, 0),) + spatial_axes = list(range(1, 1 + self.rank)) + padding_all_dims = transpose_shape(padding_all_dims, + self.data_format, + spatial_axes) + output_shape = list(input_shape) + for dim in range(len(output_shape)): + if output_shape[dim] is not None: + output_shape[dim] += sum(padding_all_dims[dim]) + return tuple(output_shape) + def get_config(self): - config = {'size': self.size, + config = {'padding': self.padding, 'data_format': self.data_format} - base_config = super(UpSampling3D, self).get_config() + base_config = super(_ZeroPadding, self).get_config() return dict(list(base_config.items()) + list(config.items())) -class ZeroPadding1D(Layer): +class ZeroPadding1D(_ZeroPadding): """Zero-padding layer for 1D input (e.g. temporal sequence). 
# Arguments @@ -2083,29 +2161,22 @@ class ZeroPadding1D(Layer): """ def __init__(self, padding=1, **kwargs): - super(ZeroPadding1D, self).__init__(**kwargs) - self.padding = conv_utils.normalize_tuple(padding, 2, 'padding') - self.input_spec = InputSpec(ndim=3) - - def compute_output_shape(self, input_shape): - if input_shape[1] is not None: - length = input_shape[1] + self.padding[0] + self.padding[1] - else: - length = None - return (input_shape[0], - length, - input_shape[2]) + normalized_padding = (conv_utils.normalize_tuple(padding, 2, 'padding'),) + super(ZeroPadding1D, self).__init__(normalized_padding, + 'channels_last', + **kwargs) def call(self, inputs): - return K.temporal_padding(inputs, padding=self.padding) + return K.temporal_padding(inputs, padding=self.padding[0]) def get_config(self): - config = {'padding': self.padding} - base_config = super(ZeroPadding1D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) + config = super(ZeroPadding1D, self).get_config() + config['padding'] = config['padding'][0] + config.pop('data_format') + return config -class ZeroPadding2D(Layer): +class ZeroPadding2D(_ZeroPadding): """Zero-padding layer for 2D input (e.g. picture). This layer can add rows and columns of zeros @@ -2153,10 +2224,8 @@ def __init__(self, padding=(1, 1), data_format=None, **kwargs): - super(ZeroPadding2D, self).__init__(**kwargs) - self.data_format = K.normalize_data_format(data_format) if isinstance(padding, int): - self.padding = ((padding, padding), (padding, padding)) + normalized_padding = ((padding, padding), (padding, padding)) elif hasattr(padding, '__len__'): if len(padding) != 2: raise ValueError('`padding` should have two elements. ' @@ -2165,7 +2234,7 @@ def __init__(self, '1st entry of padding') width_padding = conv_utils.normalize_tuple(padding[1], 2, '2nd entry of padding') - self.padding = (height_padding, width_padding) + normalized_padding = (height_padding, width_padding) else: raise ValueError('`padding` should be either an int, ' 'a tuple of 2 ints ' @@ -2173,49 +2242,17 @@ def __init__(self, 'or a tuple of 2 tuples of 2 ints ' '((top_pad, bottom_pad), (left_pad, right_pad)). ' 'Found: ' + str(padding)) - self.input_spec = InputSpec(ndim=4) - - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - if input_shape[2] is not None: - rows = input_shape[2] + self.padding[0][0] + self.padding[0][1] - else: - rows = None - if input_shape[3] is not None: - cols = input_shape[3] + self.padding[1][0] + self.padding[1][1] - else: - cols = None - return (input_shape[0], - input_shape[1], - rows, - cols) - elif self.data_format == 'channels_last': - if input_shape[1] is not None: - rows = input_shape[1] + self.padding[0][0] + self.padding[0][1] - else: - rows = None - if input_shape[2] is not None: - cols = input_shape[2] + self.padding[1][0] + self.padding[1][1] - else: - cols = None - return (input_shape[0], - rows, - cols, - input_shape[3]) + super(ZeroPadding2D, self).__init__(normalized_padding, + data_format, + **kwargs) def call(self, inputs): return K.spatial_2d_padding(inputs, padding=self.padding, data_format=self.data_format) - def get_config(self): - config = {'padding': self.padding, - 'data_format': self.data_format} - base_config = super(ZeroPadding2D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - -class ZeroPadding3D(Layer): +class ZeroPadding3D(_ZeroPadding): """Zero-padding layer for 3D data (spatial or spatio-temporal). 
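To illustrate the asymmetric padding form handled by the `ZeroPadding2D` refactor above: the shared `_ZeroPadding.compute_output_shape` simply adds the summed padding to each spatial dimension. The shapes in this sketch are illustrative:

```python
from keras.models import Sequential
from keras.layers import ZeroPadding2D

# 1 row of zeros on top, 2 on the bottom, 3 columns on each side:
# rows: 28 + 1 + 2 = 31, cols: 28 + 3 + 3 = 34.
model = Sequential()
model.add(ZeroPadding2D(padding=((1, 2), (3, 3)), input_shape=(28, 28, 1)))
model.summary()  # output shape: (None, 31, 34, 1)
```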
# Arguments @@ -2228,7 +2265,9 @@ class ZeroPadding3D(Layer): `(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad)`. - If tuple of 3 tuples of 2 ints: interpreted as - `((left_dim1_pad, right_dim1_pad), (left_dim2_pad, right_dim2_pad), (left_dim3_pad, right_dim3_pad))` + `((left_dim1_pad, right_dim1_pad), + (left_dim2_pad, right_dim2_pad), + (left_dim3_pad, right_dim3_pad))` data_format: A string, one of `"channels_last"` or `"channels_first"`. The ordering of the dimensions in the inputs. @@ -2243,24 +2282,26 @@ class ZeroPadding3D(Layer): # Input shape 5D tensor with shape: - If `data_format` is `"channels_last"`: - `(batch, first_axis_to_pad, second_axis_to_pad, third_axis_to_pad, depth)` + `(batch, first_axis_to_pad, second_axis_to_pad, third_axis_to_pad, + depth)` - If `data_format` is `"channels_first"`: - `(batch, depth, first_axis_to_pad, second_axis_to_pad, third_axis_to_pad)` + `(batch, depth, + first_axis_to_pad, second_axis_to_pad, third_axis_to_pad)` # Output shape 5D tensor with shape: - If `data_format` is `"channels_last"`: - `(batch, first_padded_axis, second_padded_axis, third_axis_to_pad, depth)` + `(batch, first_padded_axis, second_padded_axis, third_axis_to_pad, + depth)` - If `data_format` is `"channels_first"`: - `(batch, depth, first_padded_axis, second_padded_axis, third_axis_to_pad)` + `(batch, depth, + first_padded_axis, second_padded_axis, third_axis_to_pad)` """ @interfaces.legacy_zeropadding3d_support def __init__(self, padding=(1, 1, 1), data_format=None, **kwargs): - super(ZeroPadding3D, self).__init__(**kwargs) - self.data_format = K.normalize_data_format(data_format) if isinstance(padding, int): - self.padding = ((padding, padding), (padding, padding), (padding, padding)) + normalized_padding = 3 * ((padding, padding),) elif hasattr(padding, '__len__'): if len(padding) != 3: raise ValueError('`padding` should have 3 elements. ' @@ -2271,69 +2312,88 @@ def __init__(self, padding=(1, 1, 1), data_format=None, **kwargs): '2nd entry of padding') dim3_padding = conv_utils.normalize_tuple(padding[2], 2, '3rd entry of padding') - self.padding = (dim1_padding, dim2_padding, dim3_padding) + normalized_padding = (dim1_padding, dim2_padding, dim3_padding) else: - raise ValueError('`padding` should be either an int, ' - 'a tuple of 3 ints ' - '(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad), ' - 'or a tuple of 3 tuples of 2 ints ' - '((left_dim1_pad, right_dim1_pad),' - ' (left_dim2_pad, right_dim2_pad),' - ' (left_dim3_pad, right_dim2_pad)). 
' - 'Found: ' + str(padding)) - self.input_spec = InputSpec(ndim=5) - - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - if input_shape[2] is not None: - dim1 = input_shape[2] + self.padding[0][0] + self.padding[0][1] - else: - dim1 = None - if input_shape[3] is not None: - dim2 = input_shape[3] + self.padding[1][0] + self.padding[1][1] - else: - dim2 = None - if input_shape[4] is not None: - dim3 = input_shape[4] + self.padding[2][0] + self.padding[2][1] - else: - dim3 = None - return (input_shape[0], - input_shape[1], - dim1, - dim2, - dim3) - elif self.data_format == 'channels_last': - if input_shape[1] is not None: - dim1 = input_shape[1] + self.padding[0][0] + self.padding[0][1] - else: - dim1 = None - if input_shape[2] is not None: - dim2 = input_shape[2] + self.padding[1][0] + self.padding[1][1] - else: - dim2 = None - if input_shape[3] is not None: - dim3 = input_shape[3] + self.padding[2][0] + self.padding[2][1] - else: - dim3 = None - return (input_shape[0], - dim1, - dim2, - dim3, - input_shape[4]) + raise ValueError( + '`padding` should be either an int, a tuple of 3 ints ' + '(symmetric_dim1_pad, symmetric_dim2_pad, symmetric_dim3_pad), ' + 'or a tuple of 3 tuples of 2 ints ' + '((left_dim1_pad, right_dim1_pad),' + ' (left_dim2_pad, right_dim2_pad),' + ' (left_dim3_pad, right_dim2_pad)). ' + 'Found: ' + str(padding)) + super(ZeroPadding3D, self).__init__(normalized_padding, + data_format, + **kwargs) def call(self, inputs): return K.spatial_3d_padding(inputs, padding=self.padding, data_format=self.data_format) + +class _Cropping(Layer): + """Abstract nD copping layer (private, used as implementation base). + + # Arguments + cropping: A tuple of tuples of 2 ints. + data_format: A string, + one of `"channels_last"` or `"channels_first"`. + The ordering of the dimensions in the inputs. + `"channels_last"` corresponds to inputs with shape + `(batch, ..., channels)` while `"channels_first"` corresponds to + inputs with shape `(batch, channels, ...)`. + It defaults to the `image_data_format` value found in your + Keras config file at `~/.keras/keras.json`. + If you never set it, then it will be "channels_last". + For Cropping1D, the data format is always `"channels_last"`. + """ + + def __init__(self, cropping, + data_format=None, + **kwargs): + super(_Cropping, self).__init__(**kwargs) + # self.rank is 1 for Cropping1D, 2 for Cropping2D... 
+ self.rank = len(cropping) + self.cropping = cropping + self.data_format = K.normalize_data_format(data_format) + self.input_spec = InputSpec(ndim=2 + self.rank) + + def call(self, inputs): + slices_dims = [] + for start, end in self.cropping: + if end == 0: + end = None + else: + end = -end + slices_dims.append(slice(start, end)) + + slices = [slice(None)] + slices_dims + [slice(None)] + slices = tuple(slices) + spatial_axes = list(range(1, 1 + self.rank)) + slices = transpose_shape(slices, self.data_format, spatial_axes) + return inputs[slices] + + def compute_output_shape(self, input_shape): + cropping_all_dims = ((0, 0),) + self.cropping + ((0, 0),) + spatial_axes = list(range(1, 1 + self.rank)) + cropping_all_dims = transpose_shape(cropping_all_dims, + self.data_format, + spatial_axes) + output_shape = list(input_shape) + for dim in range(len(output_shape)): + if output_shape[dim] is not None: + output_shape[dim] -= sum(cropping_all_dims[dim]) + return tuple(output_shape) + def get_config(self): - config = {'padding': self.padding, + config = {'cropping': self.cropping, 'data_format': self.data_format} - base_config = super(ZeroPadding3D, self).get_config() + base_config = super(_Cropping, self).get_config() return dict(list(base_config.items()) + list(config.items())) -class Cropping1D(Layer): +class Cropping1D(_Cropping): """Cropping layer for 1D input (e.g. temporal sequence). It crops along the time dimension (axis 1). @@ -2353,25 +2413,19 @@ class Cropping1D(Layer): """ def __init__(self, cropping=(1, 1), **kwargs): - super(Cropping1D, self).__init__(**kwargs) - self.cropping = conv_utils.normalize_tuple(cropping, 2, 'cropping') - self.input_spec = InputSpec(ndim=3) - - def compute_output_shape(self, input_shape): - return _compute_output_shape_cropping(input_shape, - 'channels_last', - (self.cropping,)) - - def call(self, inputs): - return _call_cropping(inputs, 'channels_last', (self.cropping,)) + normalized_cropping = (conv_utils.normalize_tuple(cropping, 2, 'cropping'),) + super(Cropping1D, self).__init__(normalized_cropping, + 'channels_last', + **kwargs) def get_config(self): - config = {'cropping': self.cropping} base_config = super(Cropping1D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) + base_config.pop('data_format') + base_config['cropping'] = base_config['cropping'][0] + return base_config -class Cropping2D(Layer): +class Cropping2D(_Cropping): """Cropping layer for 2D input (e.g. picture). It crops along spatial dimensions, i.e. height and width. @@ -2429,10 +2483,8 @@ class Cropping2D(Layer): @interfaces.legacy_cropping2d_support def __init__(self, cropping=((0, 0), (0, 0)), data_format=None, **kwargs): - super(Cropping2D, self).__init__(**kwargs) - self.data_format = K.normalize_data_format(data_format) if isinstance(cropping, int): - self.cropping = ((cropping, cropping), (cropping, cropping)) + normalized_cropping = ((cropping, cropping), (cropping, cropping)) elif hasattr(cropping, '__len__'): if len(cropping) != 2: raise ValueError('`cropping` should have two elements. 
' @@ -2443,7 +2495,7 @@ def __init__(self, cropping=((0, 0), (0, 0)), width_cropping = conv_utils.normalize_tuple( cropping[1], 2, '2nd entry of cropping') - self.cropping = (height_cropping, width_cropping) + normalized_cropping = (height_cropping, width_cropping) else: raise ValueError('`cropping` should be either an int, ' 'a tuple of 2 ints ' @@ -2451,24 +2503,12 @@ def __init__(self, cropping=((0, 0), (0, 0)), 'or a tuple of 2 tuples of 2 ints ' '((top_crop, bottom_crop), (left_crop, right_crop)). ' 'Found: ' + str(cropping)) - self.input_spec = InputSpec(ndim=4) + super(Cropping2D, self).__init__(normalized_cropping, + data_format, + **kwargs) - def compute_output_shape(self, input_shape): - return _compute_output_shape_cropping(input_shape, - self.data_format, - self.cropping) - def call(self, inputs): - return _call_cropping(inputs, self.data_format, self.cropping) - - def get_config(self): - config = {'cropping': self.cropping, - 'data_format': self.data_format} - base_config = super(Cropping2D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class Cropping3D(Layer): +class Cropping3D(_Cropping): """Cropping layer for 3D data (e.g. spatial or spatio-temporal). # Arguments @@ -2481,7 +2521,9 @@ class Cropping3D(Layer): `(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop)`. - If tuple of 3 tuples of 2 ints: interpreted as - `((left_dim1_crop, right_dim1_crop), (left_dim2_crop, right_dim2_crop), (left_dim3_crop, right_dim3_crop))` + `((left_dim1_crop, right_dim1_crop), + (left_dim2_crop, right_dim2_crop), + (left_dim3_crop, right_dim3_crop))` data_format: A string, one of `"channels_last"` or `"channels_first"`. The ordering of the dimensions in the inputs. @@ -2496,27 +2538,30 @@ class Cropping3D(Layer): # Input shape 5D tensor with shape: - If `data_format` is `"channels_last"`: - `(batch, first_axis_to_crop, second_axis_to_crop, third_axis_to_crop, depth)` + `(batch, first_axis_to_crop, second_axis_to_crop, third_axis_to_crop, + depth)` - If `data_format` is `"channels_first"`: - `(batch, depth, first_axis_to_crop, second_axis_to_crop, third_axis_to_crop)` + `(batch, depth, + first_axis_to_crop, second_axis_to_crop, third_axis_to_crop)` # Output shape 5D tensor with shape: - If `data_format` is `"channels_last"`: - `(batch, first_cropped_axis, second_cropped_axis, third_cropped_axis, depth)` + `(batch, first_cropped_axis, second_cropped_axis, third_cropped_axis, + depth)` - If `data_format` is `"channels_first"`: - `(batch, depth, first_cropped_axis, second_cropped_axis, third_cropped_axis)` + `(batch, depth, + first_cropped_axis, second_cropped_axis, third_cropped_axis)` """ @interfaces.legacy_cropping3d_support def __init__(self, cropping=((1, 1), (1, 1), (1, 1)), data_format=None, **kwargs): - super(Cropping3D, self).__init__(**kwargs) self.data_format = K.normalize_data_format(data_format) if isinstance(cropping, int): - self.cropping = ((cropping, cropping), - (cropping, cropping), - (cropping, cropping)) + normalized_cropping = ((cropping, cropping), + (cropping, cropping), + (cropping, cropping)) elif hasattr(cropping, '__len__'): if len(cropping) != 3: raise ValueError('`cropping` should have 3 elements. 
' @@ -2527,59 +2572,19 @@ def __init__(self, cropping=((1, 1), (1, 1), (1, 1)), '2nd entry of cropping') dim3_cropping = conv_utils.normalize_tuple(cropping[2], 2, '3rd entry of cropping') - self.cropping = (dim1_cropping, dim2_cropping, dim3_cropping) - else: - raise ValueError('`cropping` should be either an int, ' - 'a tuple of 3 ints ' - '(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop), ' - 'or a tuple of 3 tuples of 2 ints ' - '((left_dim1_crop, right_dim1_crop),' - ' (left_dim2_crop, right_dim2_crop),' - ' (left_dim3_crop, right_dim2_crop)). ' - 'Found: ' + str(cropping)) - self.input_spec = InputSpec(ndim=5) - - def compute_output_shape(self, input_shape): - return _compute_output_shape_cropping(input_shape, - self.data_format, - self.cropping) - - def call(self, inputs): - return _call_cropping(inputs, self.data_format, self.cropping) - - def get_config(self): - config = {'cropping': self.cropping, - 'data_format': self.data_format} - base_config = super(Cropping3D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -def _call_cropping(inputs, data_format, cropping): - slices_dims = [] - for start, end in cropping: - if end == 0: - end = None + normalized_cropping = (dim1_cropping, dim2_cropping, dim3_cropping) else: - end = -end - slices_dims.append(slice(start, end)) - - slices = [slice(None)] + slices_dims + [slice(None)] - slices = tuple(slices) - spatial_axes = list(range(1, 1 + len(cropping))) - slices = transpose_shape(slices, data_format, spatial_axes) - return inputs[slices] - - -def _compute_output_shape_cropping(input_shape, data_format, cropping): - cropping_all_dims = ((0, 0),) + cropping + ((0, 0),) - spatial_axes = list(range(1, 1 + len(cropping))) - cropping_all_dims = transpose_shape(cropping_all_dims, data_format, spatial_axes) - - output_shape = list(input_shape) - for dim in range(len(output_shape)): - if output_shape[dim] is not None: - output_shape[dim] -= sum(cropping_all_dims[dim]) - return tuple(output_shape) + raise ValueError( + '`cropping` should be either an int, a tuple of 3 ints ' + '(symmetric_dim1_crop, symmetric_dim2_crop, symmetric_dim3_crop), ' + 'or a tuple of 3 tuples of 2 ints ' + '((left_dim1_crop, right_dim1_crop),' + ' (left_dim2_crop, right_dim2_crop),' + ' (left_dim3_crop, right_dim2_crop)). ' + 'Found: ' + str(cropping)) + super(Cropping3D, self).__init__(normalized_cropping, + data_format, + **kwargs) # Aliases diff --git a/keras/layers/convolutional_recurrent.py b/keras/layers/convolutional_recurrent.py index dc1f7bbe5a3..0907a439e7c 100644 --- a/keras/layers/convolutional_recurrent.py +++ b/keras/layers/convolutional_recurrent.py @@ -21,6 +21,7 @@ from ..legacy.layers import Recurrent, ConvRecurrent2D from .recurrent import RNN from ..utils.generic_utils import has_arg +from ..utils.generic_utils import to_list from ..utils.generic_utils import transpose_shape @@ -30,17 +31,15 @@ class ConvRNN2D(RNN): # Arguments cell: A RNN cell instance. A RNN cell is a class that has: - a `call(input_at_t, states_at_t)` method, returning - `(output_at_t, states_at_t_plus_1)`. The call method of the - cell can also take the optional argument `constants`, see - section "Note on passing external constants" below. - - a `state_size` attribute. This can be a single integer - (single state) in which case it is - the number of channels of the recurrent state - (which should be the same as the number of channels of the cell output). - This can also be a list/tuple of integers - (one size per state). 
In this case, the first entry - (`state_size[0]`) should be the same as - the size of the cell output. + `(output_at_t, states_at_t_plus_1)`. The call method of the + cell can also take the optional argument `constants`, see + section "Note on passing external constants" below. + - a `state_size` attribute. This can be a single integer (single state) + in which case it is the number of channels of the recurrent state + (which should be the same as the number of channels of the cell + output). This can also be a list/tuple of integers + (one size per state). In this case, the first entry (`state_size[0]`) + should be the same as the size of the cell output. return_sequences: Boolean. Whether to return the last output. in the output sequence, or the full sequence. return_state: Boolean. Whether to return the last state @@ -64,14 +63,18 @@ class ConvRNN2D(RNN): - if `return_state`: a list of tensors. The first tensor is the output. The remaining tensors are the last states, each 5D tensor with shape: - `(samples, timesteps, filters, new_rows, new_cols)` if data_format='channels_first' + `(samples, timesteps, + filters, new_rows, new_cols)` if data_format='channels_first' or 5D tensor with shape: - `(samples, timesteps, new_rows, new_cols, filters)` if data_format='channels_last'. + `(samples, timesteps, + new_rows, new_cols, filters)` if data_format='channels_last'. `rows` and `cols` values might have changed due to padding. - if `return_sequences`: 5D tensor with shape: - `(samples, timesteps, filters, new_rows, new_cols)` if data_format='channels_first' + `(samples, timesteps, + filters, new_rows, new_cols)` if data_format='channels_first' or 5D tensor with shape: - `(samples, timesteps, new_rows, new_cols, filters)` if data_format='channels_last'. + `(samples, timesteps, + new_rows, new_cols, filters)` if data_format='channels_last'. - else, 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if data_format='channels_first' or 4D tensor with shape: @@ -223,7 +226,8 @@ def build(self, input_shape): 'An initial_state was passed that is not compatible with ' '`cell.state_size`. 
Received `state_spec`={}; ' 'However `cell.state_size` is ' - '{}'.format([spec.shape for spec in self.state_spec], self.cell.state_size)) + '{}'.format([spec.shape for spec in self.state_spec], + self.cell.state_size)) else: if self.cell.data_format == 'channels_first': self.state_spec = [InputSpec(shape=(None, dim, None, None)) @@ -387,10 +391,7 @@ def step(inputs, states): output._uses_learning_phase = True if self.return_state: - if not isinstance(states, (list, tuple)): - states = [states] - else: - states = list(states) + states = to_list(states, allow_tuple=True) return [output] + states else: return output @@ -415,7 +416,8 @@ def reset_states(self, states=None): '- If using the functional API, specify ' 'the time dimension by passing a ' '`batch_shape` argument to your Input layer.\n' - 'The same thing goes for the number of rows and columns.') + 'The same thing goes for the number of rows ' + 'and columns.') # helper function def get_tuple_shape(nb_channels): @@ -443,12 +445,11 @@ def get_tuple_shape(nb_channels): K.set_value(self.states[0], np.zeros(get_tuple_shape(self.cell.state_size))) else: - if not isinstance(states, (list, tuple)): - states = [states] + states = to_list(states, allow_tuple=True) if len(states) != len(self.states): raise ValueError('Layer ' + self.name + ' expects ' + str(len(self.states)) + ' states, ' - 'but it received ' + str(len(states)) + + 'but it received ' + str(len(states)) + ' state values. Input received: ' + str(states)) for index, (value, state) in enumerate(zip(states, self.states)): @@ -508,7 +509,8 @@ class ConvLSTM2DCell(Layer): unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at initialization. Use in combination with `bias_initializer="zeros"`. - This is recommended in [Jozefowicz et al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) + This is recommended in [Jozefowicz et al. (2015)]( + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). kernel_regularizer: Regularizer function applied to the `kernel` weights matrix (see [regularizer](../regularizers.md)). 
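The `to_list(states, allow_tuple=True)` call above (and the matching changes in `advanced_activations.py` and `embeddings.py`) replaces hand-rolled `isinstance` checks; the behavior those call sites rely on can be sketched as follows (reconstructed from the replaced code, not the actual `keras.utils.generic_utils` source):

```python
def to_list_sketch(x, allow_tuple=False):
    # Wrap a single value in a list; optionally accept tuples as well.
    if isinstance(x, list):
        return x
    if allow_tuple and isinstance(x, tuple):
        return list(x)
    return [x]

print(to_list_sketch(3, allow_tuple=True))       # [3]
print(to_list_sketch((1, 2), allow_tuple=True))  # [1, 2]
print(to_list_sketch([1, 2], allow_tuple=True))  # [1, 2]
```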
@@ -561,7 +563,8 @@ def __init__(self, filters, self.strides = conv_utils.normalize_tuple(strides, 2, 'strides') self.padding = conv_utils.normalize_padding(padding) self.data_format = K.normalize_data_format(data_format) - self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2, 'dilation_rate') + self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2, + 'dilation_rate') self.activation = activations.get(activation) self.recurrent_activation = activations.get(recurrent_activation) self.use_bias = use_bias @@ -639,9 +642,11 @@ def bias_initializer(_, *args, **kwargs): self.kernel_i = self.kernel[:, :, :, :self.filters] self.recurrent_kernel_i = self.recurrent_kernel[:, :, :, :self.filters] self.kernel_f = self.kernel[:, :, :, self.filters: self.filters * 2] - self.recurrent_kernel_f = self.recurrent_kernel[:, :, :, self.filters: self.filters * 2] + self.recurrent_kernel_f = ( + self.recurrent_kernel[:, :, :, self.filters: self.filters * 2]) self.kernel_c = self.kernel[:, :, :, self.filters * 2: self.filters * 3] - self.recurrent_kernel_c = self.recurrent_kernel[:, :, :, self.filters * 2: self.filters * 3] + self.recurrent_kernel_c = ( + self.recurrent_kernel[:, :, :, self.filters * 2: self.filters * 3]) self.kernel_o = self.kernel[:, :, :, self.filters * 3:] self.recurrent_kernel_o = self.recurrent_kernel[:, :, :, self.filters * 3:] @@ -755,17 +760,24 @@ def get_config(self): 'data_format': self.data_format, 'dilation_rate': self.dilation_rate, 'activation': activations.serialize(self.activation), - 'recurrent_activation': activations.serialize(self.recurrent_activation), + 'recurrent_activation': + activations.serialize(self.recurrent_activation), 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), 'unit_forget_bias': self.unit_forget_bias, - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'kernel_regularizer': + regularizers.serialize(self.kernel_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), + 'kernel_constraint': + constraints.serialize(self.kernel_constraint), + 'recurrent_constraint': + constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint), 'dropout': self.dropout, 'recurrent_dropout': self.recurrent_dropout} @@ -823,7 +835,8 @@ class ConvLSTM2D(ConvRNN2D): unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at initialization. Use in combination with `bias_initializer="zeros"`. - This is recommended in [Jozefowicz et al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) + This is recommended in [Jozefowicz et al. (2015)]( + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). kernel_regularizer: Regularizer function applied to the `kernel` weights matrix (see [regularizer](../regularizers.md)). 
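A minimal `ConvLSTM2D` usage sketch; the sequence length, frame size, and filter count are illustrative:

```python
from keras.models import Sequential
from keras.layers import ConvLSTM2D

# 10-step sequences of 40x40 single-channel frames, returning the full
# per-step sequence of 32 feature maps (channels_last data format).
model = Sequential()
model.add(ConvLSTM2D(filters=32, kernel_size=(3, 3), padding='same',
                     return_sequences=True,
                     input_shape=(10, 40, 40, 1)))
model.summary()  # output shape: (None, 10, 40, 40, 32)
```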
@@ -888,9 +901,9 @@ class ConvLSTM2D(ConvRNN2D): # References - [Convolutional LSTM Network: A Machine Learning Approach for - Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1) - The current implementation does not include the feedback loop on the - cells output + Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1) + The current implementation does not include the feedback loop on the + cells output """ @interfaces.legacy_convlstm2d_support @@ -1046,18 +1059,26 @@ def get_config(self): 'data_format': self.data_format, 'dilation_rate': self.dilation_rate, 'activation': activations.serialize(self.activation), - 'recurrent_activation': activations.serialize(self.recurrent_activation), + 'recurrent_activation': + activations.serialize(self.recurrent_activation), 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), 'unit_forget_bias': self.unit_forget_bias, - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'kernel_regularizer': + regularizers.serialize(self.kernel_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), + 'kernel_constraint': + constraints.serialize(self.kernel_constraint), + 'recurrent_constraint': + constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint), 'dropout': self.dropout, 'recurrent_dropout': self.recurrent_dropout} diff --git a/keras/layers/core.py b/keras/layers/core.py index 0c55f8db8eb..db8dbfb7800 100644 --- a/keras/layers/core.py +++ b/keras/layers/core.py @@ -95,7 +95,8 @@ class Dropout(Layer): seed: A Python integer to use as random seed. 
# References - - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf) + - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting] + (http://www.jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf) """ @interfaces.legacy_dropout_support def __init__(self, rate, noise_shape=None, seed=None, **kwargs): @@ -158,7 +159,8 @@ class SpatialDropout1D(Dropout): Same as input # References - - [Efficient Object Localization Using Convolutional Networks](https://arxiv.org/abs/1411.4280) + - [Efficient Object Localization Using Convolutional Networks] + (https://arxiv.org/abs/1411.4280) """ @interfaces.legacy_spatialdropout1d_support @@ -203,7 +205,8 @@ class SpatialDropout2D(Dropout): Same as input # References - - [Efficient Object Localization Using Convolutional Networks](https://arxiv.org/abs/1411.4280) + - [Efficient Object Localization Using Convolutional Networks] + (https://arxiv.org/abs/1411.4280) """ @interfaces.legacy_spatialdropoutNd_support @@ -251,7 +254,8 @@ class SpatialDropout3D(Dropout): Same as input # References - - [Efficient Object Localization Using Convolutional Networks](https://arxiv.org/abs/1411.4280) + - [Efficient Object Localization Using Convolutional Networks] + (https://arxiv.org/abs/1411.4280) """ @interfaces.legacy_spatialdropoutNd_support @@ -669,7 +673,8 @@ def compute_output_shape(self, input_shape): else: shape = self._output_shape(input_shape) if not isinstance(shape, (list, tuple)): - raise ValueError('`output_shape` function must return a tuple or a list of tuples.') + raise ValueError('`output_shape` function must return a tuple or ' + 'a list of tuples.') if isinstance(shape, list): if isinstance(shape[0], int) or shape[0] is None: shape = tuple(shape) @@ -894,7 +899,8 @@ def get_config(self): 'bias_initializer': initializers.serialize(self.bias_initializer), 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint) } diff --git a/keras/layers/cudnn_recurrent.py b/keras/layers/cudnn_recurrent.py index b9796ceac70..5dae25e26fd 100644 --- a/keras/layers/cudnn_recurrent.py +++ b/keras/layers/cudnn_recurrent.py @@ -308,21 +308,25 @@ def get_config(self): config = { 'units': self.units, 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), + 'recurrent_constraint': + 
constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint)} base_config = super(CuDNNGRU, self).get_config() return dict(list(base_config.items()) + list(config.items())) class CuDNNLSTM(_CuDNNRNN): - """Fast LSTM implementation backed by [CuDNN](https://developer.nvidia.com/cudnn). + """Fast LSTM implementation with [CuDNN](https://developer.nvidia.com/cudnn). Can only be run on GPU, with the TensorFlow backend. @@ -334,7 +338,8 @@ class CuDNNLSTM(_CuDNNRNN): unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at initialization. Setting it to true will also force `bias_initializer="zeros"`. - This is recommended in [Jozefowicz et al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) + This is recommended in [Jozefowicz et al. (2015)]( + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). recurrent_initializer: Initializer for the `recurrent_kernel` weights matrix, used for the linear transformation of the recurrent state. @@ -457,8 +462,10 @@ def bias_initializer(shape, *args, **kwargs): self.kernel_o = self.kernel[:, self.units * 3:] self.recurrent_kernel_i = self.recurrent_kernel[:, :self.units] - self.recurrent_kernel_f = self.recurrent_kernel[:, self.units: self.units * 2] - self.recurrent_kernel_c = self.recurrent_kernel[:, self.units * 2: self.units * 3] + self.recurrent_kernel_f = ( + self.recurrent_kernel[:, self.units: self.units * 2]) + self.recurrent_kernel_c = ( + self.recurrent_kernel[:, self.units * 2: self.units * 3]) self.recurrent_kernel_o = self.recurrent_kernel[:, self.units * 3:] self.bias_i_i = self.bias[:self.units] @@ -522,13 +529,16 @@ def get_config(self): config = { 'units': self.units, 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), 'unit_forget_bias': self.unit_forget_bias, 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint)} diff --git a/keras/layers/embeddings.py b/keras/layers/embeddings.py index 09be4da4f2e..7cc235d4fd4 100644 --- a/keras/layers/embeddings.py +++ b/keras/layers/embeddings.py @@ -11,6 +11,7 @@ from .. import constraints from ..engine.base_layer import Layer from ..legacy import interfaces +from ..utils.generic_utils import to_list class Embedding(Layer): @@ -25,7 +26,8 @@ class Embedding(Layer): model = Sequential() model.add(Embedding(1000, 64, input_length=10)) # the model will take as input an integer matrix of size (batch, input_length). - # the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size). + # the largest integer (i.e. word index) in the input should be + # no larger than 999 (vocabulary size). 
# now model.output_shape == (None, 10, 64), where None is the batch dimension. input_array = np.random.randint(1000, size=(32, 10)) @@ -72,7 +74,8 @@ class Embedding(Layer): 3D tensor with shape: `(batch_size, sequence_length, output_dim)`. # References - - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287) + - [A Theoretically Grounded Application of Dropout in + Recurrent Neural Networks](http://arxiv.org/abs/1512.05287) """ @interfaces.legacy_embedding_support @@ -123,18 +126,17 @@ def compute_output_shape(self, input_shape): return input_shape + (self.output_dim,) else: # input_length can be tuple if input is 3D or higher - if isinstance(self.input_length, (list, tuple)): - in_lens = list(self.input_length) - else: - in_lens = [self.input_length] + in_lens = to_list(self.input_length, allow_tuple=True) if len(in_lens) != len(input_shape) - 1: - ValueError('"input_length" is %s, but received input has shape %s' % - (str(self.input_length), str(input_shape))) + raise ValueError( + '"input_length" is %s, but received input has shape %s' % + (str(self.input_length), str(input_shape))) else: for i, (s1, s2) in enumerate(zip(in_lens, input_shape[1:])): if s1 is not None and s2 is not None and s1 != s2: - ValueError('"input_length" is %s, but received input has shape %s' % - (str(self.input_length), str(input_shape))) + raise ValueError( + '"input_length" is %s, but received input has shape %s' % + (str(self.input_length), str(input_shape))) elif s1 is None: in_lens[i] = s2 return (input_shape[0],) + tuple(in_lens) + (self.output_dim,) @@ -147,7 +149,8 @@ def call(self, inputs): # Refer to this issue: https://github.com/awslabs/keras-apache-mxnet/issues/63 if K.backend() == "mxnet": if self.sparse_grad: - out = K.embedding(inputs, self.embeddings, self.input_dim, self.output_dim, sparse_grad=self.sparse_grad) + out = K.embedding(inputs, self.embeddings, self.input_dim, self.output_dim, + sparse_grad=self.sparse_grad) else: out = K.embedding(inputs, self.embeddings, self.input_dim, self.output_dim) else: @@ -157,10 +160,14 @@ def call(self, inputs): def get_config(self): config = {'input_dim': self.input_dim, 'output_dim': self.output_dim, - 'embeddings_initializer': initializers.serialize(self.embeddings_initializer), - 'embeddings_regularizer': regularizers.serialize(self.embeddings_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), - 'embeddings_constraint': constraints.serialize(self.embeddings_constraint), + 'embeddings_initializer': + initializers.serialize(self.embeddings_initializer), + 'embeddings_regularizer': + regularizers.serialize(self.embeddings_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), + 'embeddings_constraint': + constraints.serialize(self.embeddings_constraint), 'mask_zero': self.mask_zero, 'input_length': self.input_length} base_config = super(Embedding, self).get_config() diff --git a/keras/layers/local.py b/keras/layers/local.py index 8833a60495d..b7c9cabfecf 100644 --- a/keras/layers/local.py +++ b/keras/layers/local.py @@ -170,7 +170,8 @@ def get_config(self): 'bias_initializer': initializers.serialize(self.bias_initializer), 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 
'kernel_constraint': constraints.serialize(self.kernel_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint) } @@ -188,12 +189,13 @@ class LocallyConnected2D(Layer): # Examples ```python - # apply a 3x3 unshared weights convolution with 64 output filters on a 32x32 image - # with `data_format="channels_last"`: + # apply a 3x3 unshared weights convolution with 64 output filters + # on a 32x32 image with `data_format="channels_last"`: model = Sequential() model.add(LocallyConnected2D(64, (3, 3), input_shape=(32, 32, 3))) # now model.output_shape == (None, 30, 30, 64) - # notice that this layer will consume (30*30)*(3*3*3*64) + (30*30)*64 parameters + # notice that this layer will consume (30*30)*(3*3*3*64) + # + (30*30)*64 parameters # add a 3x3 unshared weights convolution on top, with 32 output filters: model.add(LocallyConnected2D(32, (3, 3))) @@ -313,9 +315,10 @@ def build(self, input_shape): self.padding, self.strides[1]) self.output_row = output_row self.output_col = output_col - self.kernel_shape = (output_row * output_col, - self.kernel_size[0] * self.kernel_size[1] * input_filter, - self.filters) + self.kernel_shape = ( + output_row * output_col, + self.kernel_size[0] * self.kernel_size[1] * input_filter, + self.filters) self.kernel = self.add_weight(shape=self.kernel_shape, initializer=self.kernel_initializer, name='kernel', @@ -380,7 +383,8 @@ def get_config(self): 'bias_initializer': initializers.serialize(self.bias_initializer), 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint) } diff --git a/keras/layers/merge.py b/keras/layers/merge.py index 2b1cf687d60..a0d57459968 100644 --- a/keras/layers/merge.py +++ b/keras/layers/merge.py @@ -87,7 +87,8 @@ def build(self, input_shape): shape = None else: shape = input_shape[i][1:] - output_shape = self._compute_elemwise_op_output_shape(output_shape, shape) + output_shape = self._compute_elemwise_op_output_shape(output_shape, + shape) # If the inputs have different ranks, we have to reshape them # to make them broadcastable. if None not in input_shape and len(set(map(len, input_shape))) == 1: @@ -122,8 +123,10 @@ def call(self, inputs): if x_ndim is None: x_shape = K.shape(x) batch_size = x_shape[0] - new_shape = K.concatenate([x_shape[1:], K.expand_dims(batch_size)]) - x_transposed = K.reshape(x, K.stack([batch_size, K.prod(x_shape[1:])])) + new_shape = K.concatenate([x_shape[1:], + K.expand_dims(batch_size)]) + x_transposed = K.reshape(x, K.stack([batch_size, + K.prod(x_shape[1:])])) x_transposed = K.permute_dimensions(x_transposed, (1, 0)) x_transposed = K.reshape(x_transposed, new_shape) reshaped_inputs.append(x_transposed) @@ -133,17 +136,20 @@ def call(self, inputs): reshaped_inputs.append(K.permute_dimensions(x, dims)) transposed = True else: - # We don't transpose inputs if they are 1D vectors or scalars. + # We don't transpose inputs if they are + # 1D vectors or scalars. reshaped_inputs.append(x) y = self._merge_function(reshaped_inputs) y_ndim = K.ndim(y) if transposed: - # If inputs have been transposed, we have to transpose the output too. + # If inputs have been transposed, + # we have to transpose the output too. 
if y_ndim is None: y_shape = K.shape(y) y_ndim = K.shape(y_shape)[0] batch_size = y_shape[y_ndim - 1] - new_shape = K.concatenate([K.expand_dims(batch_size), y_shape[:y_ndim - 1]]) + new_shape = K.concatenate([K.expand_dims(batch_size), + y_shape[:y_ndim - 1]]) y = K.reshape(y, (-1, batch_size)) y = K.permute_dimensions(y, (1, 0)) y = K.reshape(y, new_shape) @@ -164,7 +170,8 @@ def compute_output_shape(self, input_shape): shape = None else: shape = input_shape[i][1:] - output_shape = self._compute_elemwise_op_output_shape(output_shape, shape) + output_shape = self._compute_elemwise_op_output_shape(output_shape, + shape) batch_sizes = [s[0] for s in input_shape if s is not None] batch_sizes = set(batch_sizes) batch_sizes -= set([None]) @@ -206,7 +213,8 @@ class Add(_Merge): x1 = keras.layers.Dense(8, activation='relu')(input1) input2 = keras.layers.Input(shape=(32,)) x2 = keras.layers.Dense(8, activation='relu')(input2) - added = keras.layers.Add()([x1, x2]) # equivalent to added = keras.layers.add([x1, x2]) + # equivalent to added = keras.layers.add([x1, x2]) + added = keras.layers.Add()([x1, x2]) out = keras.layers.Dense(4)(added) model = keras.models.Model(inputs=[input1, input2], outputs=out) diff --git a/keras/layers/noise.py b/keras/layers/noise.py index 7fe99669655..78122083f11 100644 --- a/keras/layers/noise.py +++ b/keras/layers/noise.py @@ -74,7 +74,8 @@ class GaussianDropout(Layer): Same shape as input. # References - - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting Srivastava, Hinton, et al. 2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf) + - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting] + (http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf) """ @interfaces.legacy_gaussiandropout_support diff --git a/keras/layers/normalization.py b/keras/layers/normalization.py index 6deba3a5a8f..e536cf07fcc 100644 --- a/keras/layers/normalization.py +++ b/keras/layers/normalization.py @@ -53,7 +53,8 @@ class BatchNormalization(Layer): Same shape as input. 
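The `_Merge.call` reshaping and transposition shown above is what lets element-wise merge layers broadcast inputs of different ranks. A small sketch of that behavior; the shapes are illustrative, not taken from the patch:

```python
import numpy as np
from keras.layers import Input, Add
from keras.models import Model

# x2 has one fewer dimension than x1; the merge layer reshapes and
# transposes internally so the addition broadcasts over the time axis.
x1 = Input(shape=(3, 4))
x2 = Input(shape=(4,))
out = Add()([x1, x2])
model = Model([x1, x2], out)
print(model.output_shape)  # (None, 3, 4)

y = model.predict([np.zeros((2, 3, 4)), np.ones((2, 4))])
assert y.shape == (2, 3, 4)
```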
# References - - [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167) + - [Batch Normalization: Accelerating Deep Network Training by + Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167) """ @interfaces.legacy_batchnorm_support @@ -81,7 +82,8 @@ def __init__(self, self.beta_initializer = initializers.get(beta_initializer) self.gamma_initializer = initializers.get(gamma_initializer) self.moving_mean_initializer = initializers.get(moving_mean_initializer) - self.moving_variance_initializer = initializers.get(moving_variance_initializer) + self.moving_variance_initializer = ( + initializers.get(moving_variance_initializer)) self.beta_regularizer = regularizers.get(beta_regularizer) self.gamma_regularizer = regularizers.get(gamma_regularizer) self.beta_constraint = constraints.get(beta_constraint) @@ -244,8 +246,10 @@ def get_config(self): 'scale': self.scale, 'beta_initializer': initializers.serialize(self.beta_initializer), 'gamma_initializer': initializers.serialize(self.gamma_initializer), - 'moving_mean_initializer': initializers.serialize(self.moving_mean_initializer), - 'moving_variance_initializer': initializers.serialize(self.moving_variance_initializer), + 'moving_mean_initializer': + initializers.serialize(self.moving_mean_initializer), + 'moving_variance_initializer': + initializers.serialize(self.moving_variance_initializer), 'beta_regularizer': regularizers.serialize(self.beta_regularizer), 'gamma_regularizer': regularizers.serialize(self.gamma_regularizer), 'beta_constraint': constraints.serialize(self.beta_constraint), diff --git a/keras/layers/pooling.py b/keras/layers/pooling.py index 6df9428be1e..f2ce5d7e837 100644 --- a/keras/layers/pooling.py +++ b/keras/layers/pooling.py @@ -17,39 +17,51 @@ class _Pooling1D(Layer): """ def __init__(self, pool_size=2, strides=None, - padding='valid', **kwargs): + padding='valid', data_format='channels_last', **kwargs): super(_Pooling1D, self).__init__(**kwargs) if strides is None: strides = pool_size self.pool_size = conv_utils.normalize_tuple(pool_size, 1, 'pool_size') self.strides = conv_utils.normalize_tuple(strides, 1, 'strides') self.padding = conv_utils.normalize_padding(padding) + self.data_format = K.normalize_data_format(data_format) self.input_spec = InputSpec(ndim=3) def compute_output_shape(self, input_shape): - length = conv_utils.conv_output_length(input_shape[1], + if self.data_format == 'channels_first': + steps = input_shape[2] + features = input_shape[1] + else: + steps = input_shape[1] + features = input_shape[2] + length = conv_utils.conv_output_length(steps, self.pool_size[0], self.padding, self.strides[0]) - return (input_shape[0], length, input_shape[2]) + if self.data_format == 'channels_first': + return (input_shape[0], features, length) + else: + return (input_shape[0], length, features) def _pooling_function(self, inputs, pool_size, strides, padding, data_format): raise NotImplementedError def call(self, inputs): - inputs = K.expand_dims(inputs, 2) # add dummy last dimension + dummy_axis = 2 if self.data_format == 'channels_last' else 3 + inputs = K.expand_dims(inputs, dummy_axis) # add dummy last dimension output = self._pooling_function(inputs=inputs, pool_size=self.pool_size + (1,), strides=self.strides + (1,), padding=self.padding, - data_format='channels_last') - return K.squeeze(output, 2) # remove dummy last dimension + data_format=self.data_format) + return K.squeeze(output, dummy_axis) # remove dummy last 
dimension def get_config(self): config = {'strides': self.strides, 'pool_size': self.pool_size, - 'padding': self.padding} + 'padding': self.padding, + 'data_format': self.data_format} base_config = super(_Pooling1D, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -63,19 +75,37 @@ class MaxPooling1D(_Pooling1D): E.g. 2 will halve the input. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, steps, features)` while `channels_first` + corresponds to inputs with shape + `(batch, features, steps)`. # Input shape - 3D tensor with shape: `(batch_size, steps, features)`. + - If `data_format='channels_last'`: + 3D tensor with shape: + `(batch_size, steps, features)` + - If `data_format='channels_first'`: + 3D tensor with shape: + `(batch_size, features, steps)` # Output shape - 3D tensor with shape: `(batch_size, downsampled_steps, features)`. + - If `data_format='channels_last'`: + 3D tensor with shape: + `(batch_size, downsampled_steps, features)` + - If `data_format='channels_first'`: + 3D tensor with shape: + `(batch_size, features, downsampled_steps)` """ @interfaces.legacy_pooling1d_support def __init__(self, pool_size=2, strides=None, - padding='valid', **kwargs): + padding='valid', data_format='channels_last', **kwargs): super(MaxPooling1D, self).__init__(pool_size, strides, - padding, **kwargs) + padding, data_format, + **kwargs) def _pooling_function(self, inputs, pool_size, strides, padding, data_format): @@ -93,19 +123,37 @@ class AveragePooling1D(_Pooling1D): E.g. 2 will halve the input. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, steps, features)` while `channels_first` + corresponds to inputs with shape + `(batch, features, steps)`. # Input shape - 3D tensor with shape: `(batch_size, steps, features)`. + - If `data_format='channels_last'`: + 3D tensor with shape: + `(batch_size, steps, features)` + - If `data_format='channels_first'`: + 3D tensor with shape: + `(batch_size, features, steps)` # Output shape - 3D tensor with shape: `(batch_size, downsampled_steps, features)`. + - If `data_format='channels_last'`: + 3D tensor with shape: + `(batch_size, downsampled_steps, features)` + - If `data_format='channels_first'`: + 3D tensor with shape: + `(batch_size, features, downsampled_steps)` """ @interfaces.legacy_pooling1d_support def __init__(self, pool_size=2, strides=None, - padding='valid', **kwargs): + padding='valid', data_format='channels_last', **kwargs): super(AveragePooling1D, self).__init__(pool_size, strides, - padding, **kwargs) + padding, data_format, + **kwargs) def _pooling_function(self, inputs, pool_size, strides, padding, data_format): @@ -440,37 +488,91 @@ class _GlobalPooling1D(Layer): """Abstract class for different global pooling 1D layers. 
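With the `data_format` argument added to the 1D pooling layers above, temporal pooling can now consume `(batch, features, steps)` inputs. A minimal sketch of the new shape behavior:

```python
from keras.models import Sequential
from keras.layers import MaxPooling1D

# channels_first: inputs are (batch, features, steps)
model = Sequential()
model.add(MaxPooling1D(pool_size=2, data_format='channels_first',
                       input_shape=(8, 100)))
print(model.output_shape)  # (None, 8, 50) -- steps downsampled, features kept
```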
""" - def __init__(self, **kwargs): + def __init__(self, data_format='channels_last', **kwargs): super(_GlobalPooling1D, self).__init__(**kwargs) self.input_spec = InputSpec(ndim=3) + self.data_format = K.normalize_data_format(data_format) def compute_output_shape(self, input_shape): - return (input_shape[0], input_shape[2]) + if self.data_format == 'channels_first': + return (input_shape[0], input_shape[1]) + else: + return (input_shape[0], input_shape[2]) def call(self, inputs): raise NotImplementedError + def get_config(self): + config = {'data_format': self.data_format} + base_config = super(_GlobalPooling1D, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + class GlobalAveragePooling1D(_GlobalPooling1D): """Global average pooling operation for temporal data. + # Arguments + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, steps, features)` while `channels_first` + corresponds to inputs with shape + `(batch, features, steps)`. + # Input shape - 3D tensor with shape: `(batch_size, steps, features)`. + - If `data_format='channels_last'`: + 3D tensor with shape: + `(batch_size, steps, features)` + - If `data_format='channels_first'`: + 3D tensor with shape: + `(batch_size, features, steps)` # Output shape 2D tensor with shape: `(batch_size, features)` """ - def call(self, inputs): - return K.mean(inputs, axis=1) + def __init__(self, data_format='channels_last', **kwargs): + super(GlobalAveragePooling1D, self).__init__(data_format, + **kwargs) + self.supports_masking = True + + def call(self, inputs, mask=None): + steps_axis = 1 if self.data_format == 'channels_last' else 2 + if mask is not None: + mask = K.cast(mask, K.floatx()) + input_shape = K.int_shape(inputs) + broadcast_shape = [-1, input_shape[steps_axis], 1] + mask = K.reshape(mask, broadcast_shape) + inputs *= mask + return K.sum(inputs, axis=steps_axis) / K.sum(mask, axis=steps_axis) + else: + return K.mean(inputs, axis=steps_axis) + + def compute_mask(self, inputs, mask=None): + return None class GlobalMaxPooling1D(_GlobalPooling1D): """Global max pooling operation for temporal data. + # Arguments + data_format: A string, + one of `channels_last` (default) or `channels_first`. + The ordering of the dimensions in the inputs. + `channels_last` corresponds to inputs with shape + `(batch, steps, features)` while `channels_first` + corresponds to inputs with shape + `(batch, features, steps)`. + # Input shape - 3D tensor with shape: `(batch_size, steps, features)`. + - If `data_format='channels_last'`: + 3D tensor with shape: + `(batch_size, steps, features)` + - If `data_format='channels_first'`: + 3D tensor with shape: + `(batch_size, features, steps)` # Output shape 2D tensor with shape: @@ -478,7 +580,8 @@ class GlobalMaxPooling1D(_GlobalPooling1D): """ def call(self, inputs): - return K.max(inputs, axis=1) + steps_axis = 1 if self.data_format == 'channels_last' else 2 + return K.max(inputs, axis=steps_axis) class _GlobalPooling2D(Layer): diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index 86c9c758f75..e60193aa77e 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -16,6 +16,7 @@ from ..engine.base_layer import Layer from ..engine.base_layer import InputSpec from ..utils.generic_utils import has_arg +from ..utils.generic_utils import to_list # Legacy support. 
from ..legacy.layers import Recurrent @@ -54,36 +55,56 @@ def __init__(self, cells, **kwargs): '`state_size` attribute. ' 'received cells:', cells) self.cells = cells + # reverse_state_order determines whether the state size will be in a + # reverse order of the cells' state. User might want to set this to True + # to keep the existing behavior. This is only useful when use + # `RNN(return_state=True)` since the state will be returned as the same + # order of state_size. + self.reverse_state_order = kwargs.pop('reverse_state_order', False) + if self.reverse_state_order: + warnings.warn('`reverse_state_order=True` in `StackedRNNCells` ' + 'will soon be deprecated. Please update the code to ' + 'work with the natural order of states if you ' + 'reply on the RNN states, ' + 'eg `RNN(return_state=True)`.') super(StackedRNNCells, self).__init__(**kwargs) @property def state_size(self): - # States are a flat list - # in reverse order of the cell stack. - # This allows to preserve the requirement - # `stack.state_size[0] == output_dim`. - # e.g. states of a 2-layer LSTM would be - # `[h2, c2, h1, c1]` + # States are a flat list of the individual cell state size. + # e.g. states of a 2-layer LSTM would be `[h1, c1, h2, c2]`. # (assuming one LSTM has states [h, c]) + # In the case of reverse_state_order=True, the state_size will be + # `[h2, c2, h1, c1]`. state_size = [] - for cell in self.cells[::-1]: + for cell in self.cells[::-1] if self.reverse_state_order else self.cells: if hasattr(cell.state_size, '__len__'): state_size += list(cell.state_size) else: state_size.append(cell.state_size) return tuple(state_size) + @property + def output_size(self): + if getattr(self.cells[-1], 'output_size', None) is not None: + return self.cells[-1].output_size + if hasattr(self.cells[-1].state_size, '__len__'): + return self.cells[-1].state_size[0] + else: + return self.cells[-1].state_size + def call(self, inputs, states, constants=None, **kwargs): # Recover per-cell states. nested_states = [] - for cell in self.cells[::-1]: + for cell in self.cells[::-1] if self.reverse_state_order else self.cells: if hasattr(cell.state_size, '__len__'): nested_states.append(states[:len(cell.state_size)]) states = states[len(cell.state_size):] else: nested_states.append([states[0]]) states = states[1:] - nested_states = nested_states[::-1] + if self.reverse_state_order: + nested_states = nested_states[::-1] # Call the cells in order and store the returned states. new_nested_states = [] @@ -98,10 +119,12 @@ def call(self, inputs, states, constants=None, **kwargs): # Format the new states as a flat list # in reverse cell order. - states = [] - for cell_states in new_nested_states[::-1]: - states += cell_states - return inputs, states + new_states = [] + if self.reverse_state_order: + new_nested_states = new_nested_states[::-1] + for cell_states in new_nested_states: + new_states += cell_states + return inputs, new_states def build(self, input_shape): if isinstance(input_shape, list): @@ -113,7 +136,9 @@ def build(self, input_shape): cell.build([input_shape] + constants_shape) else: cell.build(input_shape) - if hasattr(cell.state_size, '__len__'): + if getattr(cell, 'output_size', None) is not None: + output_dim = cell.output_size + elif hasattr(cell.state_size, '__len__'): output_dim = cell.state_size[0] else: output_dim = cell.state_size @@ -223,9 +248,12 @@ class RNN(Layer): the size of the recurrent state (which should be the same as the size of the cell output). 
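The `StackedRNNCells` change above switches the flat state list to the natural cell order (`[h1, c1, h2, c2]` for two LSTM cells) unless `reverse_state_order=True` is requested for backward compatibility. A sketch of what that means for `return_state`:

```python
from keras import backend as K
from keras.layers import Input, RNN, LSTMCell

cells = [LSTMCell(8), LSTMCell(16)]
x = Input(shape=(None, 4))
outputs = RNN(cells, return_state=True)(x)  # [output, h1, c1, h2, c2]

print([K.int_shape(t) for t in outputs])
# [(None, 16), (None, 8), (None, 8), (None, 16), (None, 16)]
# Constructing StackedRNNCells(cells, reverse_state_order=True) restores
# the old reversed ordering, with a deprecation warning.
```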
This can also be a list/tuple of integers - (one size per state). In this case, the first entry - (`state_size[0]`) should be the same as - the size of the cell output. + (one size per state). + - a `output_size` attribute. This can be a single integer or a + TensorShape, which represent the shape of the output. For + backward compatible reason, if this attribute is not available + for the cell, the value will be inferred by the first element + of the `state_size`. It is also possible for `cell` to be a list of RNN cell instances, in which cases the cells get stacked on after the other in the RNN, implementing an efficient stacked RNN. @@ -414,7 +442,11 @@ def compute_output_shape(self, input_shape): state_size = self.cell.state_size else: state_size = [self.cell.state_size] - output_dim = state_size[0] + + if getattr(self.cell, 'output_size', None) is not None: + output_dim = self.cell.output_size + else: + output_dim = state_size[0] if self.return_sequences: output_shape = (input_shape[0], input_shape[1], output_dim) @@ -644,10 +676,7 @@ def step(inputs, states): state._uses_learning_phase = True if self.return_state: - if not isinstance(states, (list, tuple)): - states = [states] - else: - states = list(states) + states = to_list(states, allow_tuple=True) return [output] + states else: return output @@ -682,8 +711,7 @@ def reset_states(self, states=None): K.set_value(self.states[0], np.zeros((batch_size, self.cell.state_size))) else: - if not isinstance(states, (list, tuple)): - states = [states] + states = to_list(states, allow_tuple=True) if len(states) != len(self.states): raise ValueError('Layer ' + self.name + ' expects ' + str(len(self.states)) + ' states, ' @@ -838,6 +866,7 @@ def __init__(self, units, self.dropout = min(1., max(0., dropout)) self.recurrent_dropout = min(1., max(0., recurrent_dropout)) self.state_size = self.units + self.output_size = self.units self._dropout_mask = None self._recurrent_dropout_mask = None @@ -903,14 +932,19 @@ def get_config(self): config = {'units': self.units, 'activation': activations.serialize(self.activation), 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'kernel_regularizer': + regularizers.serialize(self.kernel_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), + 'recurrent_constraint': + constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint), 'dropout': self.dropout, 'recurrent_dropout': self.recurrent_dropout} @@ -1108,15 +1142,21 @@ def get_config(self): config = {'units': self.units, 'activation': activations.serialize(self.activation), 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 
'kernel_initializer': + initializers.serialize(self.kernel_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'kernel_regularizer': + regularizers.serialize(self.kernel_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), + 'recurrent_constraint': + constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint), 'dropout': self.dropout, 'recurrent_dropout': self.recurrent_dropout} @@ -1231,6 +1271,7 @@ def __init__(self, units, self.implementation = implementation self.reset_after = reset_after self.state_size = self.units + self.output_size = self.units self._dropout_mask = None self._recurrent_dropout_mask = None @@ -1291,7 +1332,8 @@ def build(self, input_shape): # bias for hidden state - just for compatibility with CuDNN if self.reset_after: self.recurrent_bias_z = self.recurrent_bias[:self.units] - self.recurrent_bias_r = self.recurrent_bias[self.units: self.units * 2] + self.recurrent_bias_r = ( + self.recurrent_bias[self.units: self.units * 2]) self.recurrent_bias_h = self.recurrent_bias[self.units * 2:] else: self.input_bias_z = None @@ -1423,16 +1465,22 @@ def call(self, inputs, states, training=None): def get_config(self): config = {'units': self.units, 'activation': activations.serialize(self.activation), - 'recurrent_activation': activations.serialize(self.recurrent_activation), + 'recurrent_activation': + activations.serialize(self.recurrent_activation), 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'kernel_regularizer': + regularizers.serialize(self.kernel_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), + 'recurrent_constraint': + constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint), 'dropout': self.dropout, 'recurrent_dropout': self.recurrent_dropout, @@ -1529,10 +1577,14 @@ class GRU(RNN): True = "after" (CuDNN compatible). 
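The `output_size` attribute introduced earlier in this `keras/layers/recurrent.py` diff lets a cell's output dimension be declared explicitly; when the attribute is absent, `RNN.compute_output_shape` falls back to the first entry of `state_size` as before. A minimal custom-cell sketch (the cell itself is hypothetical, not part of the patch):

```python
from keras import backend as K
from keras.layers import Layer, RNN, Input
from keras.models import Model

class MinimalRNNCell(Layer):
    """Cell whose output dimension is declared via the new `output_size`."""
    def __init__(self, units, **kwargs):
        self.units = units
        self.state_size = units
        self.output_size = units  # picked up by RNN.compute_output_shape
        super(MinimalRNNCell, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
                                      initializer='uniform', name='kernel')
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units),
            initializer='uniform', name='recurrent_kernel')
        self.built = True

    def call(self, inputs, states):
        prev_output = states[0]
        output = K.dot(inputs, self.kernel) + K.dot(prev_output,
                                                    self.recurrent_kernel)
        return output, [output]

x = Input(shape=(None, 5))
y = RNN(MinimalRNNCell(32))(x)
print(Model(x, y).output_shape)  # (None, 32)
```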
# References - - [Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation](https://arxiv.org/abs/1406.1078) - - [On the Properties of Neural Machine Translation: Encoder-Decoder Approaches](https://arxiv.org/abs/1409.1259) - - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling](http://arxiv.org/abs/1412.3555v1) - - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287) + - [Learning Phrase Representations using RNN Encoder-Decoder for + Statistical Machine Translation](https://arxiv.org/abs/1406.1078) + - [On the Properties of Neural Machine Translation: + Encoder-Decoder Approaches](https://arxiv.org/abs/1409.1259) + - [Empirical Evaluation of Gated Recurrent Neural Networks on + Sequence Modeling](https://arxiv.org/abs/1412.3555v1) + - [A Theoretically Grounded Application of Dropout in + Recurrent Neural Networks](https://arxiv.org/abs/1512.05287) """ @interfaces.legacy_recurrent_support @@ -1678,17 +1730,24 @@ def reset_after(self): def get_config(self): config = {'units': self.units, 'activation': activations.serialize(self.activation), - 'recurrent_activation': activations.serialize(self.recurrent_activation), + 'recurrent_activation': + activations.serialize(self.recurrent_activation), 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'kernel_regularizer': + regularizers.serialize(self.kernel_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), + 'recurrent_constraint': + constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint), 'dropout': self.dropout, 'recurrent_dropout': self.recurrent_dropout, @@ -1734,7 +1793,8 @@ class LSTMCell(Layer): unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at initialization. Setting it to true will also force `bias_initializer="zeros"`. - This is recommended in [Jozefowicz et al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) + This is recommended in [Jozefowicz et al. (2015)]( + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). kernel_regularizer: Regularizer function applied to the `kernel` weights matrix (see [regularizer](../regularizers.md)). 
@@ -1806,6 +1866,7 @@ def __init__(self, units, self.recurrent_dropout = min(1., max(0., recurrent_dropout)) self.implementation = implementation self.state_size = (self.units, self.units) + self.output_size = self.units self._dropout_mask = None self._recurrent_dropout_mask = None @@ -1847,8 +1908,10 @@ def bias_initializer(_, *args, **kwargs): self.kernel_o = self.kernel[:, self.units * 3:] self.recurrent_kernel_i = self.recurrent_kernel[:, :self.units] - self.recurrent_kernel_f = self.recurrent_kernel[:, self.units: self.units * 2] - self.recurrent_kernel_c = self.recurrent_kernel[:, self.units * 2: self.units * 3] + self.recurrent_kernel_f = ( + self.recurrent_kernel[:, self.units: self.units * 2]) + self.recurrent_kernel_c = ( + self.recurrent_kernel[:, self.units * 2: self.units * 3]) self.recurrent_kernel_o = self.recurrent_kernel[:, self.units * 3:] if self.use_bias: @@ -1954,17 +2017,23 @@ def call(self, inputs, states, training=None): def get_config(self): config = {'units': self.units, 'activation': activations.serialize(self.activation), - 'recurrent_activation': activations.serialize(self.recurrent_activation), + 'recurrent_activation': + activations.serialize(self.recurrent_activation), 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), 'unit_forget_bias': self.unit_forget_bias, - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'kernel_regularizer': + regularizers.serialize(self.kernel_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), + 'recurrent_constraint': + constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint), 'dropout': self.dropout, 'recurrent_dropout': self.recurrent_dropout, @@ -2002,7 +2071,8 @@ class LSTM(RNN): unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at initialization. Setting it to true will also force `bias_initializer="zeros"`. - This is recommended in [Jozefowicz et al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) + This is recommended in [Jozefowicz et al. (2015)]( + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). kernel_regularizer: Regularizer function applied to the `kernel` weights matrix (see [regularizer](../regularizers.md)). @@ -2052,10 +2122,14 @@ class LSTM(RNN): Unrolling is only suitable for short sequences. 
# References - - [Long short-term memory](http://www.bioinf.jku.at/publications/older/2604.pdf) (original 1997 paper) - - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015) - - [Supervised sequence labeling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf) - - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287) + - [Long short-term memory]( + http://www.bioinf.jku.at/publications/older/2604.pdf) + - [Learning to forget: Continual prediction with LSTM]( + http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015) + - [Supervised sequence labeling with recurrent neural networks]( + http://www.cs.toronto.edu/~graves/preprint.pdf) + - [A Theoretically Grounded Application of Dropout in + Recurrent Neural Networks](https://arxiv.org/abs/1512.05287) """ @interfaces.legacy_recurrent_support @@ -2201,18 +2275,25 @@ def implementation(self): def get_config(self): config = {'units': self.units, 'activation': activations.serialize(self.activation), - 'recurrent_activation': activations.serialize(self.recurrent_activation), + 'recurrent_activation': + activations.serialize(self.recurrent_activation), 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'recurrent_initializer': initializers.serialize(self.recurrent_initializer), + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + 'recurrent_initializer': + initializers.serialize(self.recurrent_initializer), 'bias_initializer': initializers.serialize(self.bias_initializer), 'unit_forget_bias': self.unit_forget_bias, - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer), + 'kernel_regularizer': + regularizers.serialize(self.kernel_regularizer), + 'recurrent_regularizer': + regularizers.serialize(self.recurrent_regularizer), 'bias_regularizer': regularizers.serialize(self.bias_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'recurrent_constraint': constraints.serialize(self.recurrent_constraint), + 'recurrent_constraint': + constraints.serialize(self.recurrent_constraint), 'bias_constraint': constraints.serialize(self.bias_constraint), 'dropout': self.dropout, 'recurrent_dropout': self.recurrent_dropout, diff --git a/keras/layers/wrappers.py b/keras/layers/wrappers.py index 8ac651ad15e..2d576dc57fc 100644 --- a/keras/layers/wrappers.py +++ b/keras/layers/wrappers.py @@ -276,7 +276,8 @@ def compute_mask(self, inputs, mask=None): If the output mask at each time step is not `None`: (E.g., inner layer is Masking or RNN) Concatenate all of them and return the concatenation. - If the output mask at each time step is `None` and the input mask is not `None`: + If the output mask at each time step is `None` and + the input mask is not `None`: (E.g., inner layer is Dense) Reduce the input_mask to 2 dimensions and return it. 
Otherwise (both the output mask and the input mask are `None`): @@ -538,6 +539,9 @@ def call(self, output = y * y_rev elif self.merge_mode is None: output = [y, y_rev] + else: + raise ValueError('Unrecognized value for argument ' + 'merge_mode: %s' % (self.merge_mode)) # Properly set learning phase if (getattr(y, '_uses_learning_phase', False) or diff --git a/keras/legacy/interfaces.py b/keras/legacy/interfaces.py index f8ababcfc95..45a0e310cda 100644 --- a/keras/legacy/interfaces.py +++ b/keras/legacy/interfaces.py @@ -86,8 +86,8 @@ def wrapper(*args, **kwargs): if i < len(kwargs) - 1: signature += ', ' signature += ')`' - warnings.warn('Update your `' + object_name + - '` call to the Keras 2 API: ' + signature, stacklevel=2) + warnings.warn('Update your `' + object_name + '` call to the ' + + 'Keras 2 API: ' + signature, stacklevel=2) return func(*args, **kwargs) wrapper._original_function = func return wrapper @@ -127,7 +127,8 @@ def embedding_kwargs_preprocessor(args, kwargs): kwargs.pop('dropout') warnings.warn('The `dropout` argument is no longer support in `Embedding`. ' 'You can apply a `keras.layers.SpatialDropout1D` layer ' - 'right after the `Embedding` layer to get the same behavior.', stacklevel=3) + 'right after the `Embedding` layer to get the same behavior.', + stacklevel=3) return args, kwargs, converted legacy_embedding_support = generate_legacy_interface( @@ -269,7 +270,7 @@ def conv2d_args_preprocessor(args, kwargs): converted = [] if len(args) > 4: raise TypeError('Layer can receive at most 3 positional arguments.') - if len(args) == 4: + elif len(args) == 4: if isinstance(args[2], int) and isinstance(args[3], int): new_keywords = ['padding', 'strides', 'data_format'] for kwd in new_keywords: @@ -378,7 +379,7 @@ def conv3d_args_preprocessor(args, kwargs): if len(args) > 5: raise TypeError('Layer can receive at most 4 positional arguments.') if len(args) == 5: - if isinstance(args[2], int) and isinstance(args[3], int) and isinstance(args[4], int): + if all([isinstance(x, int) for x in args[2:5]]): kernel_size = (args[2], args[3], args[4]) args = [args[0], args[1], kernel_size] converted.append(('kernel_size', 'kernel_dim*')) @@ -398,21 +399,21 @@ def conv3d_args_preprocessor(args, kwargs): args = [args[0], args[1], kernel_size] converted.append(('kernel_size', 'kernel_dim*')) elif len(args) == 3: - if 'kernel_dim2' in kwargs and 'kernel_dim3' in kwargs: + if all([x in kwargs for x in ['kernel_dim2', 'kernel_dim3']]): kernel_size = (args[2], kwargs.pop('kernel_dim2'), kwargs.pop('kernel_dim3')) args = [args[0], args[1], kernel_size] converted.append(('kernel_size', 'kernel_dim*')) elif len(args) == 2: - if 'kernel_dim1' in kwargs and 'kernel_dim2' in kwargs and 'kernel_dim3' in kwargs: + if all([x in kwargs for x in ['kernel_dim1', 'kernel_dim2', 'kernel_dim3']]): kernel_size = (kwargs.pop('kernel_dim1'), kwargs.pop('kernel_dim2'), kwargs.pop('kernel_dim3')) args = [args[0], args[1], kernel_size] converted.append(('kernel_size', 'kernel_dim*')) elif len(args) == 1: - if 'kernel_dim1' in kwargs and 'kernel_dim2' in kwargs and 'kernel_dim3' in kwargs: + if all([x in kwargs for x in ['kernel_dim1', 'kernel_dim2', 'kernel_dim3']]): kernel_size = (kwargs.pop('kernel_dim1'), kwargs.pop('kernel_dim2'), kwargs.pop('kernel_dim3')) @@ -507,7 +508,8 @@ def zeropadding2d_args_preprocessor(args, kwargs): kwargs['padding'] = ((top_pad, bottom_pad), (left_pad, right_pad)) warnings.warn('The `padding` argument in the Keras 2 API no longer' 'accepts dict types. 
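The `Bidirectional.call` hunk above adds an explicit `ValueError` for unrecognized `merge_mode` values instead of silently producing no output. A sketch of the supported `merge_mode=None` path, which returns the forward and backward sequences separately:

```python
from keras.layers import Input, LSTM, Bidirectional
from keras.models import Model

x = Input(shape=(None, 8))
# merge_mode=None yields two tensors; any other unrecognized value
# now raises a ValueError when the layer is called.
fwd, bwd = Bidirectional(LSTM(16, return_sequences=True), merge_mode=None)(x)
model = Model(x, [fwd, bwd])
print(model.output_shape)  # [(None, None, 16), (None, None, 16)]
```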
You can now input argument as: ' - '`padding=(top_pad, bottom_pad, left_pad, right_pad)`.', stacklevel=3) + '`padding=(top_pad, bottom_pad, left_pad, right_pad)`.', + stacklevel=3) elif len(args) == 2 and isinstance(args[1], dict): if set(args[1].keys()) <= {'top_pad', 'bottom_pad', 'left_pad', 'right_pad'}: @@ -518,7 +520,8 @@ def zeropadding2d_args_preprocessor(args, kwargs): args = (args[0], ((top_pad, bottom_pad), (left_pad, right_pad))) warnings.warn('The `padding` argument in the Keras 2 API no longer' 'accepts dict types. You can now input argument as: ' - '`padding=((top_pad, bottom_pad), (left_pad, right_pad))`', stacklevel=3) + '`padding=((top_pad, bottom_pad), (left_pad, right_pad))`', + stacklevel=3) return args, kwargs, converted legacy_zeropadding2d_support = generate_legacy_interface( @@ -583,7 +586,8 @@ def generator_methods_args_preprocessor(args, kwargs): kwargs['steps_per_epoch'] = samples_per_epoch converted.append(('samples_per_epoch', 'steps_per_epoch')) - keras1_args = {'samples_per_epoch', 'val_samples', 'nb_epoch', 'nb_val_samples', 'nb_worker'} + keras1_args = {'samples_per_epoch', 'val_samples', + 'nb_epoch', 'nb_val_samples', 'nb_worker'} if keras1_args.intersection(kwargs.keys()): warnings.warn('The semantics of the Keras 2 argument ' '`steps_per_epoch` is not the same as the ' diff --git a/keras/legacy/layers.py b/keras/legacy/layers.py index be869335bca..9716caf89e7 100644 --- a/keras/legacy/layers.py +++ b/keras/legacy/layers.py @@ -135,7 +135,8 @@ def get_config(self): 'nb_feature': self.nb_feature, 'W_regularizer': regularizers.serialize(self.W_regularizer), 'b_regularizer': regularizers.serialize(self.b_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'W_constraint': constraints.serialize(self.W_constraint), 'b_constraint': constraints.serialize(self.b_constraint), 'bias': self.bias, @@ -269,7 +270,8 @@ def get_config(self): 'activation': activations.serialize(self.activation), 'W_regularizer': regularizers.serialize(self.W_regularizer), 'b_regularizer': regularizers.serialize(self.b_regularizer), - 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'activity_regularizer': + regularizers.serialize(self.activity_regularizer), 'W_constraint': constraints.serialize(self.W_constraint), 'b_constraint': constraints.serialize(self.b_constraint), 'bias': self.bias, @@ -485,7 +487,8 @@ def get_initial_state(self, inputs): initial_state = K.zeros_like(inputs) # (samples, timesteps, input_dim) initial_state = K.sum(initial_state, axis=(1, 2)) # (samples,) initial_state = K.expand_dims(initial_state) # (samples, 1) - initial_state = K.tile(initial_state, [1, self.units]) # (samples, output_dim) + # (samples, output_dim) + initial_state = K.tile(initial_state, [1, self.units]) initial_state = [initial_state for _ in range(len(self.states))] return initial_state @@ -497,7 +500,8 @@ def __call__(self, inputs, initial_state=None, **kwargs): # If there are multiple inputs, then # they should be the main input and `initial_state` # e.g. 
when loading model from file - if isinstance(inputs, (list, tuple)) and len(inputs) > 1 and initial_state is None: + if (isinstance(inputs, (list, tuple)) + and len(inputs) > 1 and initial_state is None): initial_state = inputs[1:] inputs = inputs[0] @@ -508,8 +512,7 @@ def __call__(self, inputs, initial_state=None, **kwargs): if initial_state is None: return super(Recurrent, self).__call__(inputs, **kwargs) - if not isinstance(initial_state, (list, tuple)): - initial_state = [initial_state] + initial_state = to_list(initial_state, allow_tuple=True) is_keras_tensor = hasattr(initial_state[0], '_keras_history') for tensor in initial_state: @@ -602,10 +605,7 @@ def call(self, inputs, mask=None, training=None, initial_state=None): output = last_output if self.return_state: - if not isinstance(states, (list, tuple)): - states = [states] - else: - states = list(states) + states = to_list(states, allow_tuple=True) return [output] + states else: return output @@ -633,8 +633,7 @@ def reset_states(self, states=None): for state in self.states: K.set_value(state, np.zeros((batch_size, self.units))) else: - if not isinstance(states, (list, tuple)): - states = [states] + states = to_list(states, allow_tuple=True) if len(states) != len(self.states): raise ValueError('Layer ' + self.name + ' expects ' + str(len(self.states)) + ' states, ' @@ -748,7 +747,8 @@ def __init__(self, filters, self.strides = conv_utils.normalize_tuple(strides, 2, 'strides') self.padding = conv_utils.normalize_padding(padding) self.data_format = K.normalize_data_format(data_format) - self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2, 'dilation_rate') + self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2, + 'dilation_rate') self.return_sequences = return_sequences self.go_backwards = go_backwards self.stateful = stateful @@ -789,9 +789,10 @@ def compute_output_shape(self, input_shape): if self.return_state: if self.data_format == 'channels_first': - output_shape = [output_shape] + [(input_shape[0], self.filters, rows, cols) for _ in range(2)] + state_shape = (input_shape[0], self.filters, rows, cols) elif self.data_format == 'channels_last': - output_shape = [output_shape] + [(input_shape[0], rows, cols, self.filters) for _ in range(2)] + state_shape = (input_shape[0], rows, cols, self.filters) + output_shape = [output_shape, state_shape, state_shape] return output_shape diff --git a/keras/metrics.py b/keras/metrics.py index f8436dea31c..6a8415da66f 100644 --- a/keras/metrics.py +++ b/keras/metrics.py @@ -35,6 +35,7 @@ def categorical_accuracy(y_true, y_pred): def sparse_categorical_accuracy(y_true, y_pred): + # flatten y_true in case it's in shape (num_samples, 1) instead of (num_samples,) return K.cast(K.equal(K.flatten(y_true), K.cast(K.argmax(y_pred, axis=-1), K.floatx())), K.floatx()) @@ -51,7 +52,9 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5): def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5): - return K.mean(K.in_top_k(y_pred, K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1) + # If the shape of y_true is (num_samples, 1), flatten to (num_samples,) + return K.mean(K.in_top_k(y_pred, K.cast(K.flatten(y_true), 'int32'), k), + axis=-1) # Aliases diff --git a/keras/models.py b/keras/models.py index d1231fb5d2d..e6e829827c4 100644 --- a/keras/models.py +++ b/keras/models.py @@ -68,8 +68,8 @@ def _clone_functional_model(model, input_tensors=None): # Cache newly created input layer. 
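The `keras/metrics.py` hunk above flattens `y_true` so the sparse accuracy metrics accept targets shaped `(num_samples, 1)` as well as `(num_samples,)`. A small sketch of the fixed `sparse_top_k_categorical_accuracy`:

```python
import numpy as np
from keras import backend as K
from keras.metrics import sparse_top_k_categorical_accuracy

y_true = K.variable(np.array([[1], [0]]))          # shape (num_samples, 1)
y_pred = K.variable(np.array([[0.1, 0.9, 0.0],
                              [0.4, 0.3, 0.3]]))
acc = K.eval(sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
print(acc)  # 1.0 -- both targets are in the top-1 predictions
```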
newly_created_input_layer = input_tensor._keras_history[0] layer_map[layer] = newly_created_input_layer - for original_input_layer, cloned_input_layer in zip(model._input_layers, input_layers): - layer_map[original_input_layer] = cloned_input_layer + for _original, _cloned in zip(model._input_layers, input_layers): + layer_map[_original] = _cloned else: # Make sure that all input tensors come from a Keras layer. # If tensor comes from an input layer: cache the input layer. diff --git a/keras/optimizers.py b/keras/optimizers.py index b783770f403..20ac031e310 100644 --- a/keras/optimizers.py +++ b/keras/optimizers.py @@ -234,7 +234,8 @@ class RMSprop(Optimizer): decay: float >= 0. Learning rate decay over each update. # References - - [rmsprop: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) + - [rmsprop: Divide the gradient by a running average of its recent magnitude] + (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) """ def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., @@ -301,7 +302,8 @@ class Adagrad(Optimizer): decay: float >= 0. Learning rate decay over each update. # References - - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + - [Adaptive Subgradient Methods for Online Learning and Stochastic + Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) """ def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs): @@ -371,7 +373,8 @@ class Adadelta(Optimizer): decay: float >= 0. Initial learning rate decay. # References - - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701) + - [Adadelta - an adaptive learning rate method] + (https://arxiv.org/abs/1212.5701) """ def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., @@ -446,8 +449,10 @@ class Adam(Optimizer): Beyond". # References - - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) - - [On the Convergence of Adam and Beyond](https://openreview.net/forum?id=ryQu7f-RZ) + - [Adam - A Method for Stochastic Optimization] + (https://arxiv.org/abs/1412.6980v8) + - [On the Convergence of Adam and Beyond] + (https://openreview.net/forum?id=ryQu7f-RZ) """ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, @@ -532,7 +537,8 @@ class Adamax(Optimizer): decay: float >= 0. Learning rate decay over each update. # References - - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) + - [Adam - A Method for Stochastic Optimization] + (https://arxiv.org/abs/1412.6980v8) """ def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999, @@ -613,7 +619,8 @@ class Nadam(Optimizer): # References - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf) - - [On the importance of initialization and momentum in deep learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf) + - [On the importance of initialization and momentum in deep learning] + (http://www.cs.toronto.edu/~fritz/absps/momentum.pdf) """ def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999, @@ -638,10 +645,10 @@ def get_updates(self, loss, params): t = K.cast(self.iterations, K.floatx()) + 1 # Due to the recommendations in [2], i.e. warming momentum schedule - momentum_cache_t = self.beta_1 * ( - 1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay))) - momentum_cache_t_1 = self.beta_1 * ( - 1. 
- 0.5 * (K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay))) + momentum_cache_t = self.beta_1 * (1. - 0.5 * ( + K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay))) + momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * ( + K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay))) m_schedule_new = self.m_schedule * momentum_cache_t m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1 self.updates.append((self.m_schedule, m_schedule_new)) @@ -659,7 +666,8 @@ def get_updates(self, loss, params): m_t_prime = m_t / (1. - m_schedule_next) v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g) v_t_prime = v_t / (1. - K.pow(self.beta_2, t)) - m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime + m_t_bar = (1. - momentum_cache_t) * g_prime + ( + momentum_cache_t_1 * m_t_prime) self.updates.append(K.update(m, m_t)) self.updates.append(K.update(v, v_t)) diff --git a/keras/preprocessing/image.py b/keras/preprocessing/image.py index 8830d600607..b5708cdd56b 100644 --- a/keras/preprocessing/image.py +++ b/keras/preprocessing/image.py @@ -4,6 +4,10 @@ from __future__ import division from __future__ import print_function +import inspect + +from .. import backend +from .. import utils from keras_preprocessing import image random_rotation = image.random_rotation @@ -15,11 +19,453 @@ apply_brightness_shift = image.apply_brightness_shift random_brightness = image.random_brightness apply_affine_transform = image.apply_affine_transform -array_to_img = image.array_to_img -img_to_array = image.img_to_array -save_img = image.save_img load_img = image.load_img -ImageDataGenerator = image.ImageDataGenerator -Iterator = image.Iterator -NumpyArrayIterator = image.NumpyArrayIterator -DirectoryIterator = image.DirectoryIterator + + +def array_to_img(x, data_format=None, scale=True, dtype=None): + if data_format is None: + data_format = backend.image_data_format() + if 'dtype' in inspect.getargspec(image.array_to_img).args: + if dtype is None: + dtype = backend.floatx() + return image.array_to_img(x, + data_format=data_format, + scale=scale, + dtype=dtype) + return image.array_to_img(x, + data_format=data_format, + scale=scale) + + +def img_to_array(img, data_format=None, dtype=None): + if data_format is None: + data_format = backend.image_data_format() + if 'dtype' in inspect.getargspec(image.img_to_array).args: + if dtype is None: + dtype = backend.floatx() + return image.img_to_array(img, data_format=data_format, dtype=dtype) + return image.img_to_array(img, data_format=data_format) + + +def save_img(path, + x, + data_format=None, + file_format=None, + scale=True, **kwargs): + if data_format is None: + data_format = backend.image_data_format() + return image.save_img(path, + x, + data_format=data_format, + file_format=file_format, + scale=scale, **kwargs) + + +class Iterator(image.Iterator, utils.Sequence): + """Base class for image data iterators. + + Every `Iterator` must implement the `_get_batches_of_transformed_samples` + method. + + # Arguments + n: Integer, total number of samples in the dataset to loop over. + batch_size: Integer, size of a batch. + shuffle: Boolean, whether to shuffle the data between epochs. + seed: Random seeding for data shuffling. + """ + pass + + +class DirectoryIterator(image.DirectoryIterator, Iterator): + """Iterator capable of reading images from a directory on disk. + + # Arguments + directory: Path to the directory to read images from. 
+ Each subdirectory in this directory will be + considered to contain images from one class, + or alternatively you could specify class subdirectories + via the `classes` argument. + image_data_generator: Instance of `ImageDataGenerator` + to use for random transformations and normalization. + target_size: tuple of integers, dimensions to resize input images to. + color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. + Color mode to read images. + classes: Optional list of strings, names of subdirectories + containing images from each class (e.g. `["dogs", "cats"]`). + It will be computed automatically if not set. + class_mode: Mode for yielding the targets: + `"binary"`: binary targets (if there are only two classes), + `"categorical"`: categorical targets, + `"sparse"`: integer targets, + `"input"`: targets are images identical to input images (mainly + used to work with autoencoders), + `None`: no targets get yielded (only input images are yielded). + batch_size: Integer, size of a batch. + shuffle: Boolean, whether to shuffle the data between epochs. + seed: Random seed for data shuffling. + data_format: String, one of `channels_first`, `channels_last`. + save_to_dir: Optional directory where to save the pictures + being yielded, in a viewable format. This is useful + for visualizing the random transformations being + applied, for debugging purposes. + save_prefix: String prefix to use for saving sample + images (if `save_to_dir` is set). + save_format: Format to use for saving sample images + (if `save_to_dir` is set). + subset: Subset of data (`"training"` or `"validation"`) if + validation_split is set in ImageDataGenerator. + interpolation: Interpolation method used to resample the image if the + target size is different from that of the loaded image. + Supported methods are "nearest", "bilinear", and "bicubic". + If PIL version 1.1.3 or newer is installed, "lanczos" is also + supported. If PIL version 3.4.0 or newer is installed, "box" and + "hamming" are also supported. By default, "nearest" is used. + dtype: Dtype to use for generated arrays. + """ + + def __init__(self, directory, image_data_generator, + target_size=(256, 256), + color_mode='rgb', + classes=None, + class_mode='categorical', + batch_size=32, + shuffle=True, + seed=None, + data_format=None, + save_to_dir=None, + save_prefix='', + save_format='png', + follow_links=False, + subset=None, + interpolation='nearest', + dtype=None): + if data_format is None: + data_format = backend.image_data_format() + kwargs = {} + if 'dtype' in inspect.getargspec( + image.ImageDataGenerator.__init__).args: + if dtype is None: + dtype = backend.floatx() + kwargs['dtype'] = dtype + super(DirectoryIterator, self).__init__( + directory, image_data_generator, + target_size=target_size, + color_mode=color_mode, + classes=classes, + class_mode=class_mode, + batch_size=batch_size, + shuffle=shuffle, + seed=seed, + data_format=data_format, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + follow_links=follow_links, + subset=subset, + interpolation=interpolation, + **kwargs) + + +class NumpyArrayIterator(image.NumpyArrayIterator, Iterator): + """Iterator yielding data from a Numpy array. + + # Arguments + x: Numpy array of input data or tuple. + If tuple, the second elements is either + another numpy array or a list of numpy arrays, + each of which gets passed + through as an output without any modifications. + y: Numpy array of targets data. 
+ image_data_generator: Instance of `ImageDataGenerator` + to use for random transformations and normalization. + batch_size: Integer, size of a batch. + shuffle: Boolean, whether to shuffle the data between epochs. + sample_weight: Numpy array of sample weights. + seed: Random seed for data shuffling. + data_format: String, one of `channels_first`, `channels_last`. + save_to_dir: Optional directory where to save the pictures + being yielded, in a viewable format. This is useful + for visualizing the random transformations being + applied, for debugging purposes. + save_prefix: String prefix to use for saving sample + images (if `save_to_dir` is set). + save_format: Format to use for saving sample images + (if `save_to_dir` is set). + subset: Subset of data (`"training"` or `"validation"`) if + validation_split is set in ImageDataGenerator. + dtype: Dtype to use for the generated arrays. + """ + + def __init__(self, x, y, image_data_generator, + batch_size=32, + shuffle=False, + sample_weight=None, + seed=None, + data_format=None, + save_to_dir=None, + save_prefix='', + save_format='png', + subset=None, + dtype=None): + if data_format is None: + data_format = backend.image_data_format() + kwargs = {} + if 'dtype' in inspect.getargspec( + image.NumpyArrayIterator.__init__).args: + if dtype is None: + dtype = backend.floatx() + kwargs['dtype'] = dtype + super(NumpyArrayIterator, self).__init__( + x, y, image_data_generator, + batch_size=batch_size, + shuffle=shuffle, + sample_weight=sample_weight, + seed=seed, + data_format=data_format, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + subset=subset, + **kwargs) + + +class ImageDataGenerator(image.ImageDataGenerator): + """Generate batches of tensor image data with real-time data augmentation. + The data will be looped over (in batches). + + # Arguments + featurewise_center: Boolean. + Set input mean to 0 over the dataset, feature-wise. + samplewise_center: Boolean. Set each sample mean to 0. + featurewise_std_normalization: Boolean. + Divide inputs by std of the dataset, feature-wise. + samplewise_std_normalization: Boolean. Divide each input by its std. + zca_epsilon: epsilon for ZCA whitening. Default is 1e-6. + zca_whitening: Boolean. Apply ZCA whitening. + rotation_range: Int. Degree range for random rotations. + width_shift_range: Float, 1-D array-like or int + - float: fraction of total width, if < 1, or pixels if >= 1. + - 1-D array-like: random elements from the array. + - int: integer number of pixels from interval + `(-width_shift_range, +width_shift_range)` + - With `width_shift_range=2` possible values + are integers `[-1, 0, +1]`, + same as with `width_shift_range=[-1, 0, +1]`, + while with `width_shift_range=1.0` possible values are floats + in the interval [-1.0, +1.0). + height_shift_range: Float, 1-D array-like or int + - float: fraction of total height, if < 1, or pixels if >= 1. + - 1-D array-like: random elements from the array. + - int: integer number of pixels from interval + `(-height_shift_range, +height_shift_range)` + - With `height_shift_range=2` possible values + are integers `[-1, 0, +1]`, + same as with `height_shift_range=[-1, 0, +1]`, + while with `height_shift_range=1.0` possible values are floats + in the interval [-1.0, +1.0). + brightness_range: Tuple or list of two floats. Range for picking + a brightness shift value from. + shear_range: Float. Shear Intensity + (Shear angle in counter-clockwise direction in degrees) + zoom_range: Float or [lower, upper]. 
Range for random zoom. + If a float, `[lower, upper] = [1-zoom_range, 1+zoom_range]`. + channel_shift_range: Float. Range for random channel shifts. + fill_mode: One of {"constant", "nearest", "reflect" or "wrap"}. + Default is 'nearest'. + Points outside the boundaries of the input are filled + according to the given mode: + - 'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k) + - 'nearest': aaaaaaaa|abcd|dddddddd + - 'reflect': abcddcba|abcd|dcbaabcd + - 'wrap': abcdabcd|abcd|abcdabcd + cval: Float or Int. + Value used for points outside the boundaries + when `fill_mode = "constant"`. + horizontal_flip: Boolean. Randomly flip inputs horizontally. + vertical_flip: Boolean. Randomly flip inputs vertically. + rescale: rescaling factor. Defaults to None. + If None or 0, no rescaling is applied, + otherwise we multiply the data by the value provided + (after applying all other transformations). + preprocessing_function: function that will be implied on each input. + The function will run after the image is resized and augmented. + The function should take one argument: + one image (Numpy tensor with rank 3), + and should output a Numpy tensor with the same shape. + data_format: Image data format, + either "channels_first" or "channels_last". + "channels_last" mode means that the images should have shape + `(samples, height, width, channels)`, + "channels_first" mode means that the images should have shape + `(samples, channels, height, width)`. + It defaults to the `image_data_format` value found in your + Keras config file at `~/.keras/keras.json`. + If you never set it, then it will be "channels_last". + validation_split: Float. Fraction of images reserved for validation + (strictly between 0 and 1). + dtype: Dtype to use for the generated arrays. + + # Examples + Example of using `.flow(x, y)`: + + ```python + (x_train, y_train), (x_test, y_test) = cifar10.load_data() + y_train = np_utils.to_categorical(y_train, num_classes) + y_test = np_utils.to_categorical(y_test, num_classes) + + datagen = ImageDataGenerator( + featurewise_center=True, + featurewise_std_normalization=True, + rotation_range=20, + width_shift_range=0.2, + height_shift_range=0.2, + horizontal_flip=True) + + # compute quantities required for featurewise normalization + # (std, mean, and principal components if ZCA whitening is applied) + datagen.fit(x_train) + + # fits the model on batches with real-time data augmentation: + model.fit_generator(datagen.flow(x_train, y_train, batch_size=32), + steps_per_epoch=len(x_train) / 32, epochs=epochs) + + # here's a more "manual" example + for e in range(epochs): + print('Epoch', e) + batches = 0 + for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32): + model.fit(x_batch, y_batch) + batches += 1 + if batches >= len(x_train) / 32: + # we need to break the loop by hand because + # the generator loops indefinitely + break + ``` + Example of using `.flow_from_directory(directory)`: + + ```python + train_datagen = ImageDataGenerator( + rescale=1./255, + shear_range=0.2, + zoom_range=0.2, + horizontal_flip=True) + + test_datagen = ImageDataGenerator(rescale=1./255) + + train_generator = train_datagen.flow_from_directory( + 'data/train', + target_size=(150, 150), + batch_size=32, + class_mode='binary') + + validation_generator = test_datagen.flow_from_directory( + 'data/validation', + target_size=(150, 150), + batch_size=32, + class_mode='binary') + + model.fit_generator( + train_generator, + steps_per_epoch=2000, + epochs=50, + validation_data=validation_generator, + 
validation_steps=800) + ``` + + Example of transforming images and masks together. + + ```python + # we create two instances with the same arguments + data_gen_args = dict(featurewise_center=True, + featurewise_std_normalization=True, + rotation_range=90, + width_shift_range=0.1, + height_shift_range=0.1, + zoom_range=0.2) + image_datagen = ImageDataGenerator(**data_gen_args) + mask_datagen = ImageDataGenerator(**data_gen_args) + + # Provide the same seed and keyword arguments to the fit and flow methods + seed = 1 + image_datagen.fit(images, augment=True, seed=seed) + mask_datagen.fit(masks, augment=True, seed=seed) + + image_generator = image_datagen.flow_from_directory( + 'data/images', + class_mode=None, + seed=seed) + + mask_generator = mask_datagen.flow_from_directory( + 'data/masks', + class_mode=None, + seed=seed) + + # combine generators into one which yields image and masks + train_generator = zip(image_generator, mask_generator) + + model.fit_generator( + train_generator, + steps_per_epoch=2000, + epochs=50) + ``` + """ + + def __init__(self, + featurewise_center=False, + samplewise_center=False, + featurewise_std_normalization=False, + samplewise_std_normalization=False, + zca_whitening=False, + zca_epsilon=1e-6, + rotation_range=0, + width_shift_range=0., + height_shift_range=0., + brightness_range=None, + shear_range=0., + zoom_range=0., + channel_shift_range=0., + fill_mode='nearest', + cval=0., + horizontal_flip=False, + vertical_flip=False, + rescale=None, + preprocessing_function=None, + data_format=None, + validation_split=0.0, + dtype=None): + if data_format is None: + data_format = backend.image_data_format() + kwargs = {} + if 'dtype' in inspect.getargspec( + image.ImageDataGenerator.__init__).args: + if dtype is None: + dtype = backend.floatx() + kwargs['dtype'] = dtype + super(ImageDataGenerator, self).__init__( + featurewise_center=featurewise_center, + samplewise_center=samplewise_center, + featurewise_std_normalization=featurewise_std_normalization, + samplewise_std_normalization=samplewise_std_normalization, + zca_whitening=zca_whitening, + zca_epsilon=zca_epsilon, + rotation_range=rotation_range, + width_shift_range=width_shift_range, + height_shift_range=height_shift_range, + brightness_range=brightness_range, + shear_range=shear_range, + zoom_range=zoom_range, + channel_shift_range=channel_shift_range, + fill_mode=fill_mode, + cval=cval, + horizontal_flip=horizontal_flip, + vertical_flip=vertical_flip, + rescale=rescale, + preprocessing_function=preprocessing_function, + data_format=data_format, + validation_split=validation_split, + **kwargs) + + +array_to_img.__doc__ = image.array_to_img.__doc__ +img_to_array.__doc__ = image.img_to_array.__doc__ +save_img.__doc__ = image.save_img.__doc__ diff --git a/keras/preprocessing/sequence.py b/keras/preprocessing/sequence.py index b5e5b6c6699..c7ede3e3830 100644 --- a/keras/preprocessing/sequence.py +++ b/keras/preprocessing/sequence.py @@ -5,9 +5,74 @@ from __future__ import print_function from keras_preprocessing import sequence +from .. import utils pad_sequences = sequence.pad_sequences make_sampling_table = sequence.make_sampling_table skipgrams = sequence.skipgrams _remove_long_seq = sequence._remove_long_seq # TODO: make it public? -TimeseriesGenerator = sequence.TimeseriesGenerator + + +class TimeseriesGenerator(sequence.TimeseriesGenerator, utils.Sequence): + """Utility class for generating batches of temporal data. 
+ + This class takes in a sequence of data-points gathered at + equal intervals, along with time series parameters such as + stride, length of history, etc., to produce batches for + training/validation. + + # Arguments + data: Indexable generator (such as list or Numpy array) + containing consecutive data points (timesteps). + The data should be at 2D, and axis 0 is expected + to be the time dimension. + targets: Targets corresponding to timesteps in `data`. + It should have same length as `data`. + length: Length of the output sequences (in number of timesteps). + sampling_rate: Period between successive individual timesteps + within sequences. For rate `r`, timesteps + `data[i]`, `data[i-r]`, ... `data[i - length]` + are used for create a sample sequence. + stride: Period between successive output sequences. + For stride `s`, consecutive output samples would + be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc. + start_index: Data points earlier than `start_index` will not be used + in the output sequences. This is useful to reserve part of the + data for test or validation. + end_index: Data points later than `end_index` will not be used + in the output sequences. This is useful to reserve part of the + data for test or validation. + shuffle: Whether to shuffle output samples, + or instead draw them in chronological order. + reverse: Boolean: if `true`, timesteps in each output sample will be + in reverse chronological order. + batch_size: Number of timeseries samples in each batch + (except maybe the last one). + + # Returns + A [Sequence](/utils/#sequence) instance. + + # Examples + + ```python + from keras.preprocessing.sequence import TimeseriesGenerator + import numpy as np + + data = np.array([[i] for i in range(50)]) + targets = np.array([[i] for i in range(50)]) + + data_gen = TimeseriesGenerator(data, targets, + length=10, sampling_rate=2, + batch_size=2) + assert len(data_gen) == 20 + + batch_0 = data_gen[0] + x, y = batch_0 + assert np.array_equal(x, + np.array([[[0], [2], [4], [6], [8]], + [[1], [3], [5], [7], [9]]])) + assert np.array_equal(y, + np.array([[10], [11]])) + ``` + """ + pass diff --git a/keras/utils/__init__.py b/keras/utils/__init__.py index 664db66427d..79e3179ed67 100644 --- a/keras/utils/__init__.py +++ b/keras/utils/__init__.py @@ -7,6 +7,7 @@ # Globally-importable utils. from .io_utils import HDF5Matrix +from .io_utils import h5dict from .data_utils import get_file from .data_utils import Sequence from .data_utils import GeneratorEnqueuer diff --git a/keras/utils/conv_utils.py b/keras/utils/conv_utils.py index c4370dbf352..d8c65ff3012 100644 --- a/keras/utils/conv_utils.py +++ b/keras/utils/conv_utils.py @@ -135,7 +135,8 @@ def conv_input_length(output_length, filter_size, padding, stride): return (output_length - 1) * stride - 2 * pad + filter_size -def deconv_length(dim_size, stride_size, kernel_size, padding, output_padding): +def deconv_length(dim_size, stride_size, kernel_size, padding, + output_padding, dilation=1): """Determines output length of a transposed convolution given input length. # Arguments @@ -146,6 +147,7 @@ def deconv_length(dim_size, stride_size, kernel_size, padding, output_padding): padding: One of `"same"`, `"valid"`, `"full"`. output_padding: Integer, amount of padding along the output dimension, Can be set to `None` in which case the output length is inferred. + dilation: dilation rate, integer. # Returns The output length (integer). 
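The hunk below only adds the dilated-kernel expansion inside `deconv_length`. As a rough illustration of what the new `dilation` argument does to the inferred output length, here is a minimal standalone sketch (a hypothetical helper, not the library function itself; it assumes the usual `'valid'`/`'same'` transposed-convolution formulas used when `output_padding` is `None`):

```python
# Hypothetical sketch: how `dilation` changes the transposed-convolution
# output length (assumes the standard formulas for the inferred-length case).
def dilated_deconv_length(dim_size, stride, kernel_size, padding, dilation=1):
    # A kernel of size k with dilation rate d covers k + (k - 1) * (d - 1) inputs.
    effective_kernel = kernel_size + (kernel_size - 1) * (dilation - 1)
    if padding == 'valid':
        return dim_size * stride + max(effective_kernel - stride, 0)
    if padding == 'same':
        return dim_size * stride
    raise ValueError('Unsupported padding: ' + str(padding))


# Input length 8, stride 2, 3-wide kernel dilated by 2:
# effective kernel = 3 + (3 - 1) * (2 - 1) = 5, so 'valid' gives 8 * 2 + (5 - 2) = 19.
assert dilated_deconv_length(8, 2, 3, 'valid', dilation=2) == 19
assert dilated_deconv_length(8, 2, 3, 'valid', dilation=1) == 17
```

With `dilation=1` the effective kernel size is unchanged, so existing callers of `deconv_length` keep their previous behaviour.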
@@ -154,6 +156,9 @@ def deconv_length(dim_size, stride_size, kernel_size, padding, output_padding): if dim_size is None: return None + # Get the dilated kernel size + kernel_size = kernel_size + (kernel_size - 1) * (dilation - 1) + # Infer length if output padding is None, else compute the exact length if output_padding is None: if padding == 'valid': diff --git a/keras/utils/data_utils.py b/keras/utils/data_utils.py index a288c6a939e..6f5d248e059 100644 --- a/keras/utils/data_utils.py +++ b/keras/utils/data_utils.py @@ -13,7 +13,7 @@ import tarfile import threading import time -import traceback +import warnings import zipfile from abc import abstractmethod from contextlib import closing @@ -222,10 +222,10 @@ def dl_progress(count, block_size, total_size): try: try: urlretrieve(origin, fpath, dl_progress) - except URLError as e: - raise Exception(error_msg.format(origin, e.errno, e.reason)) except HTTPError as e: raise Exception(error_msg.format(origin, e.code, e.msg)) + except URLError as e: + raise Exception(error_msg.format(origin, e.errno, e.reason)) except (Exception, KeyboardInterrupt): if os.path.exists(fpath): os.remove(fpath) @@ -380,10 +380,9 @@ def on_epoch_end(self): pass def __iter__(self): - """Create an infinite generator that iterate over the Sequence.""" - while True: - for item in (self[i] for i in range(len(self))): - yield item + """Create a generator that iterate over the Sequence.""" + for item in (self[i] for i in range(len(self))): + yield item # Global variables to be shared across processes @@ -435,60 +434,8 @@ class SequenceEnqueuer(object): The `enqueuer.get()` should be an infinite stream of datas. """ - - @abstractmethod - def is_running(self): - raise NotImplementedError - - @abstractmethod - def start(self, workers=1, max_queue_size=10): - """Starts the handler's workers. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`). - """ - raise NotImplementedError - - @abstractmethod - def stop(self, timeout=None): - """Stop running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called start(). - - # Arguments - timeout: maximum time to wait on thread.join() - """ - raise NotImplementedError - - @abstractmethod - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples `(inputs, targets)` - or `(inputs, targets, sample_weights)`. - """ - raise NotImplementedError - - -class OrderedEnqueuer(SequenceEnqueuer): - """Builds a Enqueuer from a Sequence. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. - - # Arguments - sequence: A `keras.utils.data_utils.Sequence` object. 
- use_multiprocessing: use multiprocessing if True, otherwise threading - shuffle: whether to shuffle the data at the beginning of each epoch - """ - def __init__(self, sequence, - use_multiprocessing=False, - shuffle=False): + use_multiprocessing=False): self.sequence = sequence self.use_multiprocessing = use_multiprocessing @@ -511,7 +458,6 @@ def __init__(self, sequence, self.uid = _SEQUENCE_COUNTER.value _SEQUENCE_COUNTER.value += 1 - self.shuffle = shuffle self.workers = 0 self.executor_fn = None self.queue = None @@ -530,9 +476,7 @@ def start(self, workers=1, max_queue_size=10): (when full, workers could block on `put()`) """ if self.use_multiprocessing: - self.executor_fn = lambda seqs: mp.Pool(workers, - initializer=init_pool, - initargs=(seqs,)) + self.executor_fn = self._get_executor_init(workers) else: # We do not need the init since it's threads. self.executor_fn = lambda _: ThreadPool(workers) @@ -543,6 +487,78 @@ def start(self, workers=1, max_queue_size=10): self.run_thread.daemon = True self.run_thread.start() + def _send_sequence(self): + """Send current Iterable to all workers.""" + # For new processes that may spawn + _SHARED_SEQUENCES[self.uid] = self.sequence + + def stop(self, timeout=None): + """Stops running threads and wait for them to exit, if necessary. + + Should be called by the same thread which called `start()`. + + # Arguments + timeout: maximum time to wait on `thread.join()` + """ + self.stop_signal.set() + with self.queue.mutex: + self.queue.queue.clear() + self.queue.unfinished_tasks = 0 + self.queue.not_full.notify() + self.run_thread.join(timeout) + _SHARED_SEQUENCES[self.uid] = None + + @abstractmethod + def _run(self): + """Submits request to the executor and queue the `Future` objects.""" + raise NotImplementedError + + @abstractmethod + def _get_executor_init(self, workers): + """Get the Pool initializer for multiprocessing. + + # Returns + Function, a Function to initialize the pool + """ + raise NotImplementedError + + @abstractmethod + def get(self): + """Creates a generator to extract data from the queue. + + Skip the data if it is `None`. + + # Returns + Generator yielding tuples `(inputs, targets)` + or `(inputs, targets, sample_weights)`. + """ + raise NotImplementedError + + +class OrderedEnqueuer(SequenceEnqueuer): + """Builds a Enqueuer from a Sequence. + + Used in `fit_generator`, `evaluate_generator`, `predict_generator`. + + # Arguments + sequence: A `keras.utils.data_utils.Sequence` object. + use_multiprocessing: use multiprocessing if True, otherwise threading + shuffle: whether to shuffle the data at the beginning of each epoch + """ + def __init__(self, sequence, use_multiprocessing=False, shuffle=False): + super(OrderedEnqueuer, self).__init__(sequence, use_multiprocessing) + self.shuffle = shuffle + + def _get_executor_init(self, workers): + """Get the Pool initializer for multiprocessing. + + # Returns + Function, a Function to initialize the pool + """ + return lambda seqs: mp.Pool(workers, + initializer=init_pool, + initargs=(seqs,)) + def _wait_queue(self): """Wait for the queue to be empty.""" while True: @@ -594,28 +610,32 @@ def get(self): yield inputs except Exception as e: self.stop() - six.raise_from(StopIteration(e), e) + six.reraise(*sys.exc_info()) - def _send_sequence(self): - """Send current Sequence to all workers.""" - # For new processes that may spawn - _SHARED_SEQUENCES[self.uid] = self.sequence - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. 
+def init_pool_generator(gens, random_seed=None): + global _SHARED_SEQUENCES + _SHARED_SEQUENCES = gens - Should be called by the same thread which called `start()`. + if random_seed is not None: + ident = mp.current_process().ident + np.random.seed(random_seed + ident) - # Arguments - timeout: maximum time to wait on `thread.join()` - """ - self.stop_signal.set() - with self.queue.mutex: - self.queue.queue.clear() - self.queue.unfinished_tasks = 0 - self.queue.not_full.notify() - self.run_thread.join(timeout) - _SHARED_SEQUENCES[self.uid] = None + +def next_sample(uid): + """Get the next value from the generator `uid`. + + To allow multiple generators to be used at the same time, we use `uid` to + get a specific one. A single generator would cause the validation to + overwrite the training generator. + + # Arguments + uid: int, generator identifier + + # Returns + The next value of generator `uid`. + """ + return six.next(_SHARED_SEQUENCES[uid]) class GeneratorEnqueuer(SequenceEnqueuer): @@ -634,145 +654,33 @@ class GeneratorEnqueuer(SequenceEnqueuer): will be incremented by one for each worker. """ - def __init__(self, generator, - use_multiprocessing=False, - wait_time=0.05, - seed=None): - self.wait_time = wait_time - self._generator = generator - if os.name is 'nt' and use_multiprocessing is True: - # On Windows, avoid **SYSTEMATIC** error in `multiprocessing`: - # `TypeError: can't pickle generator objects` - # => Suggest multithreading instead of multiprocessing on Windows - raise ValueError('Using a generator with `use_multiprocessing=True`' - ' is not supported on Windows (no marshalling of' - ' generators across process boundaries). Instead,' - ' use single thread/process or multithreading.') - else: - self._use_multiprocessing = use_multiprocessing - self._threads = [] - self._stop_event = None - self._manager = None - self.queue = None - self.seed = seed - - def _data_generator_task(self): - if self._use_multiprocessing is False: - while not self._stop_event.is_set(): - with self.genlock: - try: - if (self.queue is not None and - self.queue.qsize() < self.max_queue_size): - # On all OSes, avoid **SYSTEMATIC** error - # in multithreading mode: - # `ValueError: generator already executing` - # => Serialize calls to - # infinite iterator/generator's next() function - generator_output = next(self._generator) - self.queue.put((True, generator_output)) - else: - time.sleep(self.wait_time) - except StopIteration: - break - except Exception as e: - # Can't pickle tracebacks. - # As a compromise, print the traceback and - # pickle None instead. - if not hasattr(e, '__traceback__'): - setattr(e, '__traceback__', sys.exc_info()[2]) - self.queue.put((False, e)) - self._stop_event.set() - break - else: - while not self._stop_event.is_set(): - try: - if (self.queue is not None and - self.queue.qsize() < self.max_queue_size): - generator_output = next(self._generator) - self.queue.put((True, generator_output)) - else: - time.sleep(self.wait_time) - except StopIteration: - break - except Exception as e: - # Can't pickle tracebacks. - # As a compromise, print the traceback and pickle None instead. 
- traceback.print_exc() - setattr(e, '__traceback__', None) - self.queue.put((False, e)) - self._stop_event.set() - break + def __init__(self, sequence, use_multiprocessing=False, wait_time=None, + random_seed=None): + super(GeneratorEnqueuer, self).__init__(sequence, use_multiprocessing) + self.random_seed = random_seed + if wait_time is not None: + warnings.warn('`wait_time` is not used anymore.', + DeprecationWarning) - def start(self, workers=1, max_queue_size=10): - """Kicks off threads which add data from the generator into the queue. + def _get_executor_init(self, workers): + """Get the Pool initializer for multiprocessing. - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`) - """ - try: - self.max_queue_size = max_queue_size - if self._use_multiprocessing: - self._manager = mp.Manager() - self.queue = self._manager.Queue(maxsize=max_queue_size) - self._stop_event = mp.Event() - else: - # On all OSes, avoid **SYSTEMATIC** error in multithreading mode: - # `ValueError: generator already executing` - # => Serialize calls to infinite iterator/generator's next() function - self.genlock = threading.Lock() - self.queue = queue.Queue(maxsize=max_queue_size) - self._stop_event = threading.Event() - - for _ in range(workers): - if self._use_multiprocessing: - # Reset random seed else all children processes - # share the same seed - np.random.seed(self.seed) - thread = mp.Process(target=self._data_generator_task) - thread.daemon = True - if self.seed is not None: - self.seed += 1 - else: - thread = threading.Thread(target=self._data_generator_task) - self._threads.append(thread) - thread.start() - except: - self.stop() - raise - - def is_running(self): - return self._stop_event is not None and not self._stop_event.is_set() - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. - - # Arguments - timeout: maximum time to wait on `thread.join()`. + # Returns + Function, a Function to initialize the pool """ - if self.is_running(): - self._stop_event.set() - - for thread in self._threads: - if self._use_multiprocessing: - if thread.is_alive(): - thread.terminate() - else: - # The thread.is_alive() test is subject to a race condition: - # the thread could terminate right after the test and before the - # join, rendering this test meaningless -> Call thread.join() - # always, which is ok no matter what the status of the thread. - thread.join(timeout) + return lambda seqs: mp.Pool(workers, + initializer=init_pool_generator, + initargs=(seqs, self.random_seed)) - if self._manager: - self._manager.shutdown() - - self._threads = [] - self._stop_event = None - self.queue = None + def _run(self): + """Submits request to the executor and queue the `Future` objects.""" + self._send_sequence() # Share the initial generator + with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor: + while True: + if self.stop_signal.is_set(): + return + self.queue.put( + executor.apply_async(next_sample, (self.uid,)), block=True) def get(self): """Creates a generator to extract data from the queue. @@ -784,25 +692,30 @@ def get(self): `(inputs, targets)` or `(inputs, targets, sample_weights)`. 
""" - while self.is_running(): - if not self.queue.empty(): - success, value = self.queue.get() - # Rethrow any exceptions found in the queue - if not success: - six.reraise(value.__class__, value, value.__traceback__) - # Yield regular values - if value is not None: - yield value - else: - all_finished = all([not thread.is_alive() - for thread in self._threads]) - if all_finished and self.queue.empty(): - raise StopIteration() - else: - time.sleep(self.wait_time) - - # Make sure to rethrow the first exception in the queue, if any - while not self.queue.empty(): - success, value = self.queue.get() - if not success: - six.reraise(value.__class__, value, value.__traceback__) + try: + while self.is_running(): + inputs = self.queue.get(block=True).get() + self.queue.task_done() + if inputs is not None: + yield inputs + except StopIteration: + # Special case for finite generators + last_ones = [] + while self.queue.qsize() > 0: + last_ones.append(self.queue.get(block=True)) + # Wait for them to complete + list(map(lambda f: f.wait(), last_ones)) + # Keep the good ones + last_ones = [future.get() for future in last_ones if future.successful()] + for inputs in last_ones: + if inputs is not None: + yield inputs + except Exception as e: + self.stop() + if 'generator already executing' in str(e): + raise RuntimeError( + "Your generator is NOT thread-safe." + "Keras requires a thread-safe generator when" + "`use_multiprocessing=False, workers > 1`." + "For more information see issue #1638.") + six.reraise(*sys.exc_info()) diff --git a/keras/utils/generic_utils.py b/keras/utils/generic_utils.py index 864dbbaba1e..9ea10a2b972 100644 --- a/keras/utils/generic_utils.py +++ b/keras/utils/generic_utils.py @@ -444,7 +444,7 @@ def add(self, n, values=None): self.update(self._seen_so_far + n, values) -def to_list(x): +def to_list(x, allow_tuple=False): """Normalizes a list/tensor into a list. If a tensor is passed, we return @@ -452,12 +452,18 @@ def to_list(x): # Arguments x: target object to be normalized. + allow_tuple: If False and x is a tuple, + it will be converted into a list + with a single element (the tuple). + Else converts the tuple to a list. # Returns A list. """ if isinstance(x, list): return x + if allow_tuple and isinstance(x, tuple): + return list(x) return [x] @@ -483,10 +489,7 @@ def object_list_uid(object_list): def is_all_none(iterable_or_element): - if not isinstance(iterable_or_element, (list, tuple)): - iterable = [iterable_or_element] - else: - iterable = iterable_or_element + iterable = to_list(iterable_or_element, allow_tuple=True) for element in iterable: if element is not None: return False diff --git a/keras/utils/io_utils.py b/keras/utils/io_utils.py index 0128b2fd9d6..d7b175a8f75 100644 --- a/keras/utils/io_utils.py +++ b/keras/utils/io_utils.py @@ -5,14 +5,23 @@ import numpy as np from collections import defaultdict +import sys + import six try: import h5py + HDF5_OBJECT_HEADER_LIMIT = 64512 except ImportError: h5py = None +if sys.version_info[0] == 3: + import pickle +else: + import cPickle as pickle + + class HDF5Matrix(object): """Representation of HDF5 dataset to be used instead of a Numpy array. @@ -155,3 +164,188 @@ def ask_to_proceed_with_overwrite(filepath): return False print('[TIP] Next time specify overwrite=True!') return True + + +class H5Dict(object): + """ A dict-like wrapper around h5py groups (or dicts). + + This allows us to have a single serialization logic + for both pickling and saving to disk. + + Note: This is not intended to be a generic wrapper. 
+ There are lot of edge cases which have been hardcoded, + and makes sense only in the context of model serialization/ + deserialization. + """ + + def __init__(self, path, mode='a'): + if isinstance(path, h5py.Group): + self.data = path + self._is_file = False + elif isinstance(path, str): + self.data = h5py.File(path, mode=mode) + self._is_file = True + elif isinstance(path, dict): + self.data = path + self._is_file = False + if mode == 'w': + self.data.clear() + # Flag to check if a dict is user defined data or a sub group: + self.data['_is_group'] = True + else: + raise TypeError('Required Group, str or dict. ' + 'Received: {}.'.format(type(path))) + self.read_only = mode == 'r' + + def __setitem__(self, attr, val): + if self.read_only: + raise ValueError('Cannot set item in read only mode.') + is_np = type(val).__module__ == np.__name__ + if isinstance(self.data, dict): + if isinstance(attr, bytes): + attr = attr.decode('utf-8') + if is_np: + self.data[attr] = pickle.dumps(val) + # We have to remember to unpickle in __getitem__ + self.data['_{}_pickled'.format(attr)] = True + else: + self.data[attr] = val + return + if isinstance(self.data, h5py.Group) and attr in self.data: + raise KeyError('Cannot set attribute. ' + 'Group with name "{}" exists.'.format(attr)) + if is_np: + dataset = self.data.create_dataset(attr, val.shape, dtype=val.dtype) + if not val.shape: + # scalar + dataset[()] = val + else: + dataset[:] = val + elif isinstance(val, list): + # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT` + # because in that case even chunking the array would not make the saving + # possible. + bad_attributes = [x for x in val if len(x) > HDF5_OBJECT_HEADER_LIMIT] + + # Expecting this to never be true. + if len(bad_attributes) > 0: + raise RuntimeError('The following attributes cannot be saved to ' + 'HDF5 file because they are larger than ' + '%d bytes: %s' % (HDF5_OBJECT_HEADER_LIMIT, + ', '.join(bad_attributes))) + + if val and sys.version_info[0] == 3 and isinstance(val[0], str): + # convert to bytes + val = [x.encode('utf-8') for x in val] + + data_npy = np.asarray(val) + + num_chunks = 1 + chunked_data = np.array_split(data_npy, num_chunks) + + # This will never loop forever thanks to the test above. 
+ is_too_big = lambda x: x.nbytes > HDF5_OBJECT_HEADER_LIMIT + while any(map(is_too_big, chunked_data)): + num_chunks += 1 + chunked_data = np.array_split(data_npy, num_chunks) + + if num_chunks > 1: + for chunk_id, chunk_data in enumerate(chunked_data): + self.data.attrs['%s%d' % (attr, chunk_id)] = chunk_data + else: + self.data.attrs[attr] = val + else: + self.data.attrs[attr] = val + + def __getitem__(self, attr): + if isinstance(self.data, dict): + if isinstance(attr, bytes): + attr = attr.decode('utf-8') + if attr in self.data: + val = self.data[attr] + if isinstance(val, dict) and val.get('_is_group'): + val = H5Dict(val) + elif '_{}_pickled'.format(attr) in self.data: + val = pickle.loads(val) + return val + else: + if self.read_only: + raise ValueError('Cannot create group in read only mode.') + val = {'_is_group': True} + self.data[attr] = val + return H5Dict(val) + if attr in self.data.attrs: + val = self.data.attrs[attr] + if type(val).__module__ == np.__name__: + if val.dtype.type == np.string_: + val = val.tolist() + elif attr in self.data: + val = self.data[attr] + if isinstance(val, h5py.Dataset): + val = np.asarray(val) + else: + val = H5Dict(val) + else: + # could be chunked + chunk_attr = '%s%d' % (attr, 0) + is_chunked = chunk_attr in self.data.attrs + if is_chunked: + val = [] + chunk_id = 0 + while chunk_attr in self.data.attrs: + chunk = self.data.attrs[chunk_attr] + val.extend([x.decode('utf8') for x in chunk]) + chunk_id += 1 + chunk_attr = '%s%d' % (attr, chunk_id) + else: + if self.read_only: + raise ValueError('Cannot create group in read only mode.') + val = H5Dict(self.data.create_group(attr)) + return val + + def __len__(self): + return len(self.data) + + def __iter__(self): + return iter(self.data) + + def iter(self): + return iter(self.data) + + def __getattr__(self, attr): + + def wrapper(f): + def h5wrapper(*args, **kwargs): + out = f(*args, **kwargs) + if isinstance(self.data, type(out)): + return H5Dict(out) + else: + return out + return h5wrapper + + return wrapper(getattr(self.data, attr)) + + def close(self): + if isinstance(self.data, h5py.Group): + self.data.file.flush() + if self._is_file: + self.data.close() + + def update(self, *args): + if isinstance(self.data, dict): + self.data.update(*args) + raise NotImplementedError + + def __contains__(self, key): + if isinstance(self.data, dict): + return key in self.data + else: + return (key in self.data) or (key in self.data.attrs) + + def get(self, key, default=None): + if key in self: + return self[key] + return default + + +h5dict = H5Dict diff --git a/keras/utils/multi_gpu_utils.py b/keras/utils/multi_gpu_utils.py index 4c2374753eb..5c49b1af8f9 100644 --- a/keras/utils/multi_gpu_utils.py +++ b/keras/utils/multi_gpu_utils.py @@ -59,7 +59,9 @@ def multi_gpu_model(model, gpus=None, cpu_merge=True, cpu_relocation=False): A Keras `Model` instance which can be used just like the initial `model` argument, but which distributes its workload on multiple GPUs. - # Example 1 - Training models with weights merge on CPU + # Examples + + Example 1 - Training models with weights merge on CPU ```python import tensorflow as tf @@ -100,7 +102,7 @@ def multi_gpu_model(model, gpus=None, cpu_merge=True, cpu_relocation=False): model.save('my_model.h5') ``` - # Example 2 - Training models with weights merge on CPU using cpu_relocation + Example 2 - Training models with weights merge on CPU using cpu_relocation ```python .. 
@@ -108,16 +110,16 @@ def multi_gpu_model(model, gpus=None, cpu_merge=True, cpu_relocation=False): model = Xception(weights=None, ..) try: - model = multi_gpu_model(model, cpu_relocation=True) + parallel_model = multi_gpu_model(model, cpu_relocation=True) print("Training using multiple GPUs..") - except: + except ValueError: + parallel_model = model print("Training using single GPU or CPU..") - - model.compile(..) + parallel_model.compile(..) .. ``` - # Example 3 - Training models with weights merge on GPU (recommended for NV-link) + Example 3 - Training models with weights merge on GPU (recommended for NV-link) ```python .. @@ -125,12 +127,13 @@ def multi_gpu_model(model, gpus=None, cpu_merge=True, cpu_relocation=False): model = Xception(weights=None, ..) try: - model = multi_gpu_model(model, cpu_merge=False) + parallel_model = multi_gpu_model(model, cpu_merge=False) print("Training using multiple GPUs..") except: + parallel_model = model print("Training using single GPU or CPU..") - model.compile(..) + parallel_model.compile(..) .. ``` @@ -241,10 +244,25 @@ def get_slice(data, i, parts): for o in range(len(outputs)): all_outputs[o].append(outputs[o]) + # Deduplicate output names to handle Siamese networks. + occurrences = {} + for n in model.output_names: + if n not in occurrences: + occurrences[n] = 1 + else: + occurrences[n] += 1 + conflict_counter = {n: 0 for n, count in occurrences.items() if count > 1} + output_names = [] + for n in model.output_names: + if n in conflict_counter: + conflict_counter[n] += 1 + n += '_%d' % conflict_counter[n] + output_names.append(n) + # Merge outputs under expected scope. with tf.device('/cpu:0' if cpu_merge else '/gpu:%d' % target_gpu_ids[0]): merged = [] - for name, outputs in zip(model.output_names, all_outputs): + for name, outputs in zip(output_names, all_outputs): merged.append(concatenate(outputs, axis=0, name=name)) return Model(model.inputs, merged) diff --git a/keras/utils/test_utils.py b/keras/utils/test_utils.py index 880b1605ecc..fdda30e6066 100644 --- a/keras/utils/test_utils.py +++ b/keras/utils/test_utils.py @@ -5,12 +5,9 @@ import numpy as np from numpy.testing import assert_allclose -import six from .generic_utils import has_arg from ..engine import Model, Input -from ..models import Sequential -from ..models import model_from_json from .. import backend as K @@ -102,9 +99,11 @@ def _layer_in_model_test(model): _output = recovered_model.predict(input_data) assert_allclose(_output, actual_output, rtol=1e-3) - # test training mode (e.g. useful for dropout tests) - model.compile('rmsprop', 'mse') - model.train_on_batch(input_data, actual_output) + # test training mode (e.g. useful when the layer has a + # different behavior at training and testing time). 
+ if has_arg(layer.call, 'training'): + model.compile('rmsprop', 'mse') + model.train_on_batch(input_data, actual_output) return actual_output # test in functional API @@ -117,35 +116,12 @@ def _layer_in_model_test(model): # check with the functional API model = Model(x, y) - _layer_in_model_test(model) + actual_output = _layer_in_model_test(model) - # test as first layer in Sequential API + # test instantiation from layer config layer_config = layer.get_config() layer_config['batch_input_shape'] = input_shape layer = layer.__class__.from_config(layer_config) - # check with the sequential API - model = Sequential() - model.add(layer) - actual_output = _layer_in_model_test(model) - # for further checks in the caller function return actual_output - - -def keras_test(func): - """Function wrapper to clean up after TensorFlow tests. - - # Arguments - func: test function to clean up after. - - # Returns - A function wrapping the input function. - """ - @six.wraps(func) - def wrapper(*args, **kwargs): - output = func(*args, **kwargs) - if K.backend() == 'tensorflow' or K.backend() == 'cntk': - K.clear_session() - return output - return wrapper diff --git a/keras_mxnet_ci/nightly-buildspec.yml b/keras_mxnet_ci/nightly-buildspec-python2.yml similarity index 83% rename from keras_mxnet_ci/nightly-buildspec.yml rename to keras_mxnet_ci/nightly-buildspec-python2.yml index 0a6f3972490..05388b79e61 100644 --- a/keras_mxnet_ci/nightly-buildspec.yml +++ b/keras_mxnet_ci/nightly-buildspec-python2.yml @@ -9,7 +9,7 @@ phases: echo "Installing MXNet"; apt-get update; pip install --upgrade pip; - pip install mxnet-mkl --pre; + pip install mxnet --pre; echo "Installing Tensorflow"; pip install tensorflow; echo "Installing Theano"; @@ -21,13 +21,10 @@ phases: pip install nose; echo "Installing Keras from source"; pip install -e .[visualize,tests]; - pip uninstall --yes keras; - build: commands: echo "Running PEP tests"; py.test --pep8 -m pep8 -n0; echo "Running Keras Unit Tests and Integration Tests for all the backends"; - py.test tests/ --ignore=tests/keras/utils/; - py.test tests/keras/utils/; \ No newline at end of file + python -m pytest tests/; \ No newline at end of file diff --git a/keras_mxnet_ci/nightly-buildspec-python3.yml b/keras_mxnet_ci/nightly-buildspec-python3.yml new file mode 100644 index 00000000000..c3a4ee28330 --- /dev/null +++ b/keras_mxnet_ci/nightly-buildspec-python3.yml @@ -0,0 +1,30 @@ +version: 0.2 + +phases: + install: + commands: + echo "Checking out master branch"; + git fetch; + git checkout master; + echo "Installing MXNet"; + apt-get update; + sudo apt install -y python3-pip; + pip3 install mxnet --pre; + echo "Installing Tensorflow"; + pip3 install tensorflow; + echo "Installing Theano"; + pip3 install theano; + pip3 install pillow; + sudo apt-get -y install graphviz; + pip3 install --upgrade graphviz; + pip3 install pydot; + pip3 install nose; + echo "Installing Keras from source"; + pip3 install -e .[visualize,tests]; + + build: + commands: + echo "Running PEP tests"; + py.test --pep8 -m pep8 -n0; + echo "Running Keras Unit Tests and Integration Tests for all the backends"; + python3 -m pytest tests/; \ No newline at end of file diff --git a/keras_mxnet_ci/pr-buildspec.yml b/keras_mxnet_ci/pr-buildspec-python2.yml similarity index 85% rename from keras_mxnet_ci/pr-buildspec.yml rename to keras_mxnet_ci/pr-buildspec-python2.yml index 91e512f44a2..c48d65c693e 100644 --- a/keras_mxnet_ci/pr-buildspec.yml +++ b/keras_mxnet_ci/pr-buildspec-python2.yml @@ -11,7 +11,7 @@ phases: 
echo "Installing MXNet"; apt-get update; pip install --upgrade pip; - pip install mxnet-mkl --pre; + pip install mxnet --pre; echo "Installing Tensorflow"; pip install tensorflow; echo "Installing Theano"; @@ -23,12 +23,10 @@ phases: pip install nose; echo "Installing Keras from source"; pip install -e .[visualize,tests]; - pip uninstall --yes keras; build: commands: echo "Running PEP tests"; py.test --pep8 -m pep8 -n0; echo "Running Keras Unit Tests and Integration Tests for all the backends"; - py.test tests/ --ignore=tests/keras/utils/; - py.test tests/keras/utils/; + python -m pytest tests/; \ No newline at end of file diff --git a/keras_mxnet_ci/pr-buildspec-python3.yml b/keras_mxnet_ci/pr-buildspec-python3.yml new file mode 100644 index 00000000000..d3c97fa14de --- /dev/null +++ b/keras_mxnet_ci/pr-buildspec-python3.yml @@ -0,0 +1,32 @@ +version: 0.2 + +phases: + install: + commands: + echo $CODEBUILD_SOURCE_VERSION; + PRID=$(echo $CODEBUILD_SOURCE_VERSION | sed "s/pr/pull/g"); + echo "Checking out $PRID"; + git fetch origin $PRID/head:pr_test; + git checkout pr_test; + echo "Installing MXNet"; + apt-get update; + sudo apt install -y python3-pip; + pip3 install mxnet --pre; + echo "Installing Tensorflow"; + pip3 install tensorflow; + echo "Installing Theano"; + pip3 install theano; + pip3 install pillow; + sudo apt-get -y install graphviz; + pip3 install --upgrade graphviz; + pip3 install pydot; + pip3 install nose; + echo "Installing Keras from source"; + pip3 install -e .[visualize,tests]; + + build: + commands: + echo "Running PEP tests"; + py.test --pep8 -m pep8 -n0; + echo "Running Keras Unit Tests and Integration Tests for all the backends"; + python3 -m pytest tests/; \ No newline at end of file diff --git a/pytest.ini b/pytest.ini index 907acc45202..19a189509b8 100644 --- a/pytest.ini +++ b/pytest.ini @@ -7,76 +7,26 @@ addopts=-v # Do not run tests in the build folder norecursedirs= build +# Running all tests should take less than 12 minutes. +# Otherwise, something went wrong. 
+timeout = 720 + # PEP-8 The following are ignored: # E501 line too long (82 > 79 characters) # E402 module level import not at top of file - temporary measure to continue adding ros python packaged in sys.path # E731 do not assign a lambda expression, use a def +# W503 line break occurred before a binary operator pep8ignore=* E402 \ * E731 \ - examples/conv_filter_visualization.py E501 \ - examples/deep_dream.py E501 \ - examples/image_ocr.py E501 \ - examples/imdb_fasttext.py E501 \ - examples/imdb_lstm.py E501 \ - examples/lstm_text_generation.py E501 \ - examples/mnist_hierarchical_rnn.py E501 \ - examples/mnist_net2net.py E501 \ - examples/mnist_siamese.py E501 \ - examples/mnist_tfrecord.py E501 \ - examples/neural_doodle.py E501 \ - examples/neural_style_transfer.py E501 \ - keras/callbacks.py E501 \ - keras/constraints.py E501 \ - keras/metrics.py E501 \ - keras/models.py E501 \ - keras/optimizers.py E501 \ + * W503 \ keras/backend/cntk_backend.py E501 \ keras/backend/common.py E501 \ + keras/callbacks.py E501 \ + keras/layers/embeddings.py E501 \ keras/backend/tensorflow_backend.py E501 \ keras/backend/theano_backend.py E501 \ - keras/datasets/boston_housing.py E501 \ - keras/datasets/imdb.py E501 \ - keras/datasets/reuters.py E501 \ - keras/engine/network.py E501 \ - keras/engine/saving.py E501 \ - keras/engine/training.py E501 \ - keras/engine/training_generator.py E501 \ - keras/layers/advanced_activations.py E501 \ - keras/layers/convolutional.py E501 \ - keras/layers/convolutional_recurrent.py E501 \ - keras/layers/core.py E501 \ - keras/layers/cudnn_recurrent.py E501 \ - keras/layers/embeddings.py E501 \ - keras/layers/local.py E501 \ - keras/layers/merge.py E501 \ - keras/layers/noise.py E501 \ - keras/layers/normalization.py E501 \ - keras/layers/recurrent.py E501 \ - keras/layers/wrappers.py E501 \ - keras/legacy/interfaces.py E501 \ - keras/legacy/layers.py E501 \ - tests/test_documentation.py E501 \ - tests/test_loss_weighting.py E501 \ - tests/test_model_saving.py E501 \ - tests/integration_tests/test_temporal_data_tasks.py E501 \ - tests/keras/initializers_test.py E501 \ - tests/keras/metrics_test.py E501 \ - tests/keras/optimizers_test.py E501 \ - tests/keras/test_callbacks.py E501 \ - tests/keras/test_sequential_model.py E501 \ - tests/keras/backend/backend_test.py E501 \ - tests/keras/backend/reference_operations.py E501 \ - tests/keras/engine/test_topology.py E501 \ - tests/keras/engine/test_training.py E501 \ - tests/keras/layers/convolutional_recurrent_test.py E501 \ - tests/keras/layers/convolutional_test.py E501 \ - tests/keras/layers/core_test.py E501 \ - tests/keras/layers/cudnn_recurrent_test.py E501 \ - tests/keras/layers/embeddings_test.py E501 \ - tests/keras/layers/normalization_test.py E501 \ - tests/keras/layers/wrappers_test.py E501 \ - tests/keras/legacy/interface_test.py E501 + tests/keras/backend/backend_test.py E501 # Enable line length testing with maximum line length of 120 pep8maxlinelength = 120 diff --git a/setup.py b/setup.py index 7e81fb231b0..f47898ae917 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ ''' setup(name='keras-mxnet', - version='2.2.2', + version='2.2.4', description='Deep Learning for humans. 
Keras with highly scalable,\ high performance Apache MXNet backend support.', long_description=long_description, @@ -36,16 +36,17 @@ install_requires=['numpy>=1.9.1', 'scipy>=0.14', 'six>=1.9.0', - 'h5py>=2.7.1', 'pyyaml', - 'keras_applications==1.0.4', - 'keras_preprocessing==1.0.1'], + 'h5py', + 'keras_applications>=1.0.6', + 'keras_preprocessing>=1.0.5'], extras_require={ 'visualize': ['pydot>=1.2.4'], 'tests': ['pytest', 'pytest-pep8', 'pytest-xdist', 'pytest-cov', + 'pytest-timeout', 'pandas', 'requests'], }, diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000000..e33674b8e88 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,13 @@ +import pytest +from keras import backend as K + + +@pytest.fixture(autouse=True) +def clear_session_after_test(): + """Test wrapper to clean up after TensorFlow and CNTK tests. + + This wrapper runs for all the tests in the keras test suite. + """ + yield + if K.backend() == 'tensorflow' or K.backend() == 'cntk': + K.clear_session() diff --git a/tests/integration_tests/applications_test.py b/tests/integration_tests/applications_test.py index fa57c4ab619..8cac2b54d9f 100644 --- a/tests/integration_tests/applications_test.py +++ b/tests/integration_tests/applications_test.py @@ -2,9 +2,6 @@ import random import os from multiprocessing import Process, Queue -from keras.utils.test_utils import keras_test -from keras.utils.test_utils import layer_test -from keras.models import Sequential from keras import applications from keras import backend as K @@ -27,10 +24,9 @@ (applications.MobileNetV2, 1280), (applications.DenseNet121, 1024), (applications.DenseNet169, 1664), - (applications.DenseNet201, 1920) - # TODO: enable nasnet tests if they support Theano and CNTK - # (applications.NASNetMobile, 1056), - # (applications.NASNetLarge, 4032) + (applications.DenseNet201, 1920), + # Note that NASNetLarge is too heavy to test on Travis. 
+ (applications.NASNetMobile, 1056) ] @@ -61,7 +57,6 @@ def target(queue): return model.output_shape -@keras_test def _test_application_basic(app, last_dim=1000): output_shape = _get_output_shape(lambda: app(weights=None)) assert output_shape == (None, last_dim) @@ -69,13 +64,19 @@ def _test_application_basic(app, last_dim=1000): @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend requires input shape for convolution') -@keras_test def _test_application_notop(app, last_dim): output_shape = _get_output_shape( lambda: app(weights=None, include_top=False)) assert output_shape == (None, None, None, last_dim) +def test_mobilenet_v2_legacy_import(): + from keras.applications import mobilenetv2 + assert hasattr(mobilenetv2, 'MobileNetV2') + from keras.applications import mobilenet_v2 + assert hasattr(mobilenet_v2, 'MobileNetV2') + + def test_applications(): for _ in range(3): app, last_dim = random.choice(MODEL_LIST) diff --git a/tests/keras/preprocessing/image_test.py b/tests/integration_tests/preprocessing/image_test.py similarity index 96% rename from tests/keras/preprocessing/image_test.py rename to tests/integration_tests/preprocessing/image_test.py index 31064408613..cf3146db565 100644 --- a/tests/keras/preprocessing/image_test.py +++ b/tests/integration_tests/preprocessing/image_test.py @@ -168,30 +168,6 @@ def test_image_data_generator(self, tmpdir): x2, y2 = seq[0] assert list(y) != list(y2) - def test_image_data_generator_with_validation_split(self): - for test_images in self.all_test_images: - img_list = [] - for im in test_images: - img_list.append(image.img_to_array(im)[None, ...]) - - images = np.vstack(img_list) - generator = image.ImageDataGenerator(validation_split=0.5) - seq = generator.flow(images, np.arange(images.shape[0]), - shuffle=False, batch_size=3, - subset='validation') - x, y = seq[0] - assert list(y) == [0, 1, 2] - seq = generator.flow(images, np.arange(images.shape[0]), - shuffle=False, batch_size=3, - subset='training') - x2, y2 = seq[0] - assert list(y2) == [4, 5, 6] - - with pytest.raises(ValueError): - generator.flow(images, np.arange(images.shape[0]), - shuffle=False, batch_size=3, - subset='foo') - def test_image_data_generator_with_split_value_error(self): with pytest.raises(ValueError): generator = image.ImageDataGenerator(validation_split=5) diff --git a/tests/keras/preprocessing/sequence_test.py b/tests/integration_tests/preprocessing/sequence_test.py similarity index 100% rename from tests/keras/preprocessing/sequence_test.py rename to tests/integration_tests/preprocessing/sequence_test.py diff --git a/tests/keras/preprocessing/text_test.py b/tests/integration_tests/preprocessing/text_test.py similarity index 100% rename from tests/keras/preprocessing/text_test.py rename to tests/integration_tests/preprocessing/text_test.py diff --git a/tests/integration_tests/test_image_data_tasks.py b/tests/integration_tests/test_image_data_tasks.py index 1896a043d18..87d3ad6db2f 100644 --- a/tests/integration_tests/test_image_data_tasks.py +++ b/tests/integration_tests/test_image_data_tasks.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from keras.utils.test_utils import get_test_data, keras_test +from keras.utils.test_utils import get_test_data from keras.models import Sequential from keras import layers from keras.utils.np_utils import to_categorical @@ -12,7 +12,6 @@ reason='MXNet backend does not support Pooling2d with SAME mode yet.') -@keras_test def test_image_classification(): np.random.seed(1337) input_shape = (16, 16, 3) diff --git 
a/tests/integration_tests/test_temporal_data_tasks.py b/tests/integration_tests/test_temporal_data_tasks.py index 15f2768ac22..2895bafe622 100644 --- a/tests/integration_tests/test_temporal_data_tasks.py +++ b/tests/integration_tests/test_temporal_data_tasks.py @@ -3,7 +3,7 @@ import pytest import string -from keras.utils.test_utils import get_test_data, keras_test +from keras.utils.test_utils import get_test_data from keras.utils.np_utils import to_categorical from keras.models import Sequential from keras import layers, optimizers @@ -14,7 +14,6 @@ @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support unroll=False ' 'in RNN yet.') -@keras_test def test_temporal_classification(): ''' Classify temporal sequences of float numbers @@ -50,7 +49,6 @@ def test_temporal_classification(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support unroll=False ' 'in RNN yet.') -@keras_test def test_temporal_classification_functional(): ''' Classify temporal sequences of float numbers @@ -83,7 +81,6 @@ def test_temporal_classification_functional(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support unroll=False ' 'in RNN yet.') -@keras_test def test_temporal_regression(): ''' Predict float numbers (regression) based on sequences @@ -104,7 +101,6 @@ def test_temporal_regression(): assert(history.history['loss'][-1] < 1.) -@keras_test def test_3d_to_3d(): ''' Apply a same Dense layer for each element of time dimension of the input @@ -121,7 +117,7 @@ def test_3d_to_3d(): model = Sequential() model.add(layers.TimeDistributed( - layers.Dense(y_train.shape[-1]), input_shape=(x_train.shape[1], x_train.shape[2]))) + layers.Dense(y_train.shape[-1]), input_shape=x_train.shape[1:3])) model.compile(loss='hinge', optimizer='rmsprop') history = model.fit(x_train, y_train, epochs=20, batch_size=16, validation_data=(x_test, y_test), verbose=0) @@ -131,21 +127,24 @@ def test_3d_to_3d(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support unroll=False ' 'in RNN yet.') -@keras_test def test_stacked_lstm_char_prediction(): ''' Learn alphabetical char sequence with stacked LSTM. Predict the whole alphabet based on the first two letters ('ab' -> 'ab...z') See non-toy example in examples/lstm_text_generation.py ''' - # generate alphabet: http://stackoverflow.com/questions/16060899/alphabet-range-python + # generate alphabet: + # http://stackoverflow.com/questions/16060899/alphabet-range-python alphabet = string.ascii_lowercase number_of_chars = len(alphabet) - # generate char sequences of length 'sequence_length' out of alphabet and store the next char as label (e.g. 'ab'->'c') + # generate char sequences of length 'sequence_length' out of alphabet and + # store the next char as label (e.g. 
'ab'->'c') sequence_length = 2 - sentences = [alphabet[i: i + sequence_length] for i in range(len(alphabet) - sequence_length)] - next_chars = [alphabet[i + sequence_length] for i in range(len(alphabet) - sequence_length)] + sentences = [alphabet[i: i + sequence_length] + for i in range(len(alphabet) - sequence_length)] + next_chars = [alphabet[i + sequence_length] + for i in range(len(alphabet) - sequence_length)] # Transform sequences and labels into 'one-hot' encoding x = np.zeros((len(sentences), sequence_length, number_of_chars), dtype=np.bool) @@ -157,7 +156,8 @@ def test_stacked_lstm_char_prediction(): # learn the alphabet with stacked LSTM model = Sequential([ - layers.LSTM(16, return_sequences=True, input_shape=(sequence_length, number_of_chars)), + layers.LSTM(16, return_sequences=True, + input_shape=(sequence_length, number_of_chars)), layers.LSTM(16, return_sequences=False), layers.Dense(number_of_chars, activation='softmax') ]) @@ -180,7 +180,6 @@ def test_stacked_lstm_char_prediction(): assert(generated == alphabet) -@keras_test def test_masked_temporal(): ''' Confirm that even with masking on both inputs and outputs, cross-entropies are @@ -193,20 +192,22 @@ def test_masked_temporal(): The ground-truth best cross-entropy loss should, then be -log(0.5) = 0.69 ''' + np.random.seed(1338) + model = Sequential() model.add(layers.Embedding(10, 10, mask_zero=True)) model.add(layers.Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam') - x = np.random.random_integers(1, 9, (20000, 10)) + x = np.random.randint(1, 10, size=(20000, 10)) for rowi in range(x.shape[0]): - padding = np.random.random_integers(x.shape[1] / 2) + padding = np.random.randint(0, x.shape[1] / 2 + 1) x[rowi, :padding] = 0 # 50% of the time the correct output is the input. 
# The other 50% of the time it's 2 * input % 10 - y = (x * np.random.random_integers(1, 2, x.shape)) % 10 + y = (x * np.random.randint(1, 3, size=x.shape)) % 10 ys = np.zeros((y.size, 10), dtype='int32') for i, target in enumerate(y.flat): ys[i, target] = 1 @@ -220,7 +221,6 @@ def test_masked_temporal(): @pytest.mark.skipif(K.backend() != 'tensorflow' and K.backend() != 'mxnet', reason='Requires TensorFlow or MXNet backend') -@keras_test def test_embedding_with_clipnorm(): model = Sequential() model.add(layers.Embedding(input_dim=1, output_dim=1)) diff --git a/tests/integration_tests/test_tensorflow_integration.py b/tests/integration_tests/test_tensorflow_integration.py new file mode 100644 index 00000000000..cc91cb0d5c5 --- /dev/null +++ b/tests/integration_tests/test_tensorflow_integration.py @@ -0,0 +1,50 @@ +from __future__ import print_function + +import os +import tempfile +import pytest +import keras +from keras import layers +from keras.utils.test_utils import get_test_data + + +@pytest.mark.skipif(keras.backend.backend() != 'tensorflow', + reason='Requires TF backend') +def test_tf_optimizer(): + import tensorflow as tf + + num_hidden = 10 + output_dim = 2 + input_dim = 10 + target = 0.8 + optimizer = tf.train.AdadeltaOptimizer( + learning_rate=1., rho=0.95, epsilon=1e-08) + + (x_train, y_train), (x_test, y_test) = get_test_data( + num_train=1000, num_test=200, + input_shape=(input_dim,), + classification=True, num_classes=output_dim) + + model = keras.Sequential() + model.add(layers.Dense(num_hidden, + activation='relu', + input_shape=(input_dim,))) + model.add(layers.Dense(output_dim, activation='softmax')) + + model.compile(loss='sparse_categorical_crossentropy', + optimizer=optimizer, + metrics=['accuracy']) + history = model.fit(x_train, y_train, epochs=8, batch_size=16, + validation_data=(x_test, y_test), verbose=2) + assert history.history['val_acc'][-1] >= target + + # Test saving. 
+ _, fname = tempfile.mkstemp('.h5') + model.save(fname) + model = keras.models.load_model(fname) + assert len(model.weights) == 4 + os.remove(fname) + + +if __name__ == '__main__': + pytest.main([__file__]) diff --git a/tests/integration_tests/test_vector_data_tasks.py b/tests/integration_tests/test_vector_data_tasks.py index 9e7e226bfa4..ecf5e11dcb0 100644 --- a/tests/integration_tests/test_vector_data_tasks.py +++ b/tests/integration_tests/test_vector_data_tasks.py @@ -1,7 +1,7 @@ from __future__ import print_function import pytest -from keras.utils.test_utils import get_test_data, keras_test +from keras.utils.test_utils import get_test_data from keras.models import Sequential from keras import layers import keras @@ -11,7 +11,6 @@ num_classes = 2 -@keras_test def test_vector_classification(): ''' Classify random float vectors into 2 classes with logistic regression @@ -45,7 +44,6 @@ def test_vector_classification(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support Sparse yet.') -@keras_test def test_vector_classification_functional(): (x_train, y_train), (x_test, y_test) = get_test_data(num_train=500, num_test=200, @@ -68,7 +66,6 @@ def test_vector_classification_functional(): assert(history.history['val_acc'][-1] > 0.8) -@keras_test def test_vector_regression(): ''' Perform float data prediction (regression) using 2 layer MLP diff --git a/tests/keras/activations_test.py b/tests/keras/activations_test.py index 5be59def8c8..4c0bb368303 100644 --- a/tests/keras/activations_test.py +++ b/tests/keras/activations_test.py @@ -79,6 +79,23 @@ def test_softmax_invalid(): f = K.function([x], [activations.softmax(x)]) +def test_softmax_3d(): + """Test using a reference implementation of softmax. + """ + def softmax(values, axis): + m = np.max(values, axis=axis, keepdims=True) + e = np.exp(values - m) + return e / np.sum(e, axis=axis, keepdims=True) + + x = K.placeholder(ndim=3) + f = K.function([x], [activations.softmax(x, axis=1)]) + test_values = get_standard_values()[:, :, np.newaxis].copy() + + result = f([test_values])[0] + expected = softmax(test_values, axis=1) + assert_allclose(result, expected, rtol=1e-05) + + def test_time_distributed_softmax(): x = K.placeholder(shape=(1, 1, 5)) f = K.function([x], [activations.softmax(x)]) @@ -163,6 +180,18 @@ def test_relu(): result = f([test_values])[0] assert_allclose(result, test_values, rtol=1e-05) + # Test max_value + test_values = np.array([[0.5, 1.5]], dtype=K.floatx()) + f = K.function([x], [activations.relu(x, max_value=1.)]) + result = f([test_values])[0] + assert np.max(result) <= 1. + + # Test max_value == 6. + test_values = np.array([[0.5, 6.]], dtype=K.floatx()) + f = K.function([x], [activations.relu(x, max_value=1.)]) + result = f([test_values])[0] + assert np.max(result) <= 6. 
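One detail worth noting in the `test_relu` additions above: the block commented `# Test max_value == 6.` still passes `max_value=1.`, so its assertion holds trivially. Below is a minimal standalone sketch of what that check presumably intends (it assumes only that `activations.relu` clips activations at `max_value`, the behaviour the first block already exercises):

```python
import numpy as np
from keras import activations
from keras import backend as K

# Presumed intent: clip at max_value=6 and verify the cap is actually applied.
x = K.placeholder(ndim=2)
f = K.function([x], [activations.relu(x, max_value=6.)])

test_values = np.array([[0.5, 6.5, -1.0]], dtype=K.floatx())
result = f([test_values])[0]
assert np.max(result) <= 6.   # 6.5 is clipped down to 6
assert np.min(result) >= 0.   # negative inputs are still zeroed
```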
+ def test_elu(): x = K.placeholder(ndim=2) diff --git a/tests/keras/backend/backend_test.py b/tests/keras/backend/backend_test.py index 79bface0dbb..00fdef0cf8d 100644 --- a/tests/keras/backend/backend_test.py +++ b/tests/keras/backend/backend_test.py @@ -3,12 +3,11 @@ import numpy as np import scipy.sparse as sparse import warnings -from keras.utils.test_utils import keras_test from keras import backend as K from keras.backend import floatx, set_floatx, variable from keras.utils.conv_utils import convert_kernel -import reference_operations +import reference_operations as KNP BACKENDS = [] # Holds a list of all available back-ends @@ -45,6 +44,9 @@ BACKENDS_WITHOUT_MXNET = BACKENDS - set([KMX]) +WITH_NP = [KTH if K.backend() == 'theano' else KC if K.backend() == 'cntk' else KTF, KNP, KMX] + + def check_dtype(var, dtype): if K._BACKEND == 'theano' or K._BACKEND == 'mxnet': assert var.dtype == dtype @@ -52,28 +54,20 @@ def check_dtype(var, dtype): assert var.dtype.name == '%s_ref' % dtype -def cntk_func_single_tensor(function_name, x_shape, **kwargs): - xc = KC.placeholder(x_shape) - output_cntk = getattr(KC, function_name)(xc, **kwargs) - return KC.function([xc], [output_cntk]) - - -def cntk_func_two_tensor(function_name, x_shape, y, **kwargs): - if isinstance(y, (np.generic, np.ndarray)): - xc = KC.placeholder(x_shape) - output_cntk = getattr(KC, function_name)(xc, KC.variable(y), **kwargs) - return KC.function([xc], [output_cntk]) - else: - xc = KC.placeholder(ndim=len(x_shape)) - yc = KC.placeholder(y) - output_cntk = getattr(KC, function_name)(xc, yc, **kwargs) - return KC.function([xc, yc], [output_cntk]) - +def cntk_func_tensors(function_name, shapes_or_vals, **kwargs): + placeholders = [] + variables = [] + for shape_or_val in shapes_or_vals: + if isinstance(shape_or_val, tuple): + shape = shape_or_val + placeholders.append(KC.placeholder(shape)) + else: + value = shape_or_val + variables.append(KC.variable(value)) -def cntk_func_three_tensor(function_name, x_shape, y, z, **kwargs): - xc = KC.placeholder(x_shape) - output_cntk = getattr(KC, function_name)(xc, KC.variable(y), KC.variable(z), **kwargs) - return KC.function([xc], [output_cntk]) + output_cntk = getattr(KC, function_name)(*(placeholders + variables), **kwargs) + cntk_func = KC.function(placeholders, [output_cntk]) + return output_cntk, cntk_func def parse_shape_or_val(shape_or_val): @@ -93,101 +87,79 @@ def assert_list_pairwise(z_list, shape=True, allclose=True, itself=False, atol=1 assert z1 == z2 -def assert_list_with_ref(z_list, ref): - for z in z_list: - assert z.shape == ref.shape - assert_allclose(z, ref, atol=1e-05) - - -def assert_list_keras_shape(z_list): - for z in z_list: - if hasattr(z, '_keras_shape'): - assert z._keras_shape == z.shape +def assert_list_keras_shape(t_list, z_list): + for t, z in zip(t_list, z_list): + if hasattr(t, '_keras_shape') and len(t._keras_shape) > 1: + for i, s in enumerate(t._keras_shape): + if s: + assert t._keras_shape[i] == z.shape[i] -@keras_test def check_single_tensor_operation(function_name, x_shape_or_val, backend_list, **kwargs): shape_or_val = kwargs.pop('shape_or_val', True) assert_value_equality = kwargs.pop('assert_value_equality', True) - assert_value_with_ref = kwargs.pop('assert_value_with_ref', None) cntk_dynamicity = kwargs.pop('cntk_dynamicity', False) - return_results = kwargs.pop('return_results', False) if shape_or_val: x_shape, x_val = parse_shape_or_val(x_shape_or_val) + t_list = [] z_list = [] for k in backend_list: if shape_or_val: if (k == KC) & 
(cntk_dynamicity): - z = cntk_func_single_tensor(function_name, x_shape, - **kwargs)([x_val])[0] + t, f = cntk_func_tensors(function_name, [x_shape], **kwargs) + z = f([x_val])[0] else: - z = k.eval(getattr(k, function_name)(k.variable(x_val), **kwargs)) + t = getattr(k, function_name)(k.variable(x_val), **kwargs) + z = k.eval(t) else: - z = k.eval(getattr(k, function_name)(x_shape_or_val, **kwargs)) + t = getattr(k, function_name)(x_shape_or_val, **kwargs) + z = k.eval(t) + t_list += [t] z_list += [z] - if return_results: - if len(z_list) > 1: - return z_list - else: - return z_list[0] - - if assert_value_with_ref is not None: - assert_list_with_ref(z_list, assert_value_with_ref) - else: - assert_list_pairwise(z_list, allclose=assert_value_equality) - assert_list_keras_shape(z_list) + assert_list_pairwise(z_list, allclose=assert_value_equality) + assert_list_keras_shape(t_list, z_list) -@keras_test def check_two_tensor_operation(function_name, x_shape_or_val, y_shape_or_val, backend_list, **kwargs): - shape_or_val = kwargs.pop('shape_or_val', True) concat_args = kwargs.pop('concat_args', False) cntk_dynamicity = kwargs.pop('cntk_dynamicity', False) cntk_two_dynamicity = kwargs.pop('cntk_two_dynamicity', False) - return_results = kwargs.pop('return_results', False) - if shape_or_val: - x_shape, x_val = parse_shape_or_val(x_shape_or_val) - y_shape, y_val = parse_shape_or_val(y_shape_or_val) + x_shape, x_val = parse_shape_or_val(x_shape_or_val) + y_shape, y_val = parse_shape_or_val(y_shape_or_val) + t_list = [] z_list = [] for k in backend_list: - if shape_or_val: - if (k == KC) & (cntk_dynamicity): - z = cntk_func_two_tensor(function_name, x_shape, - y=y_val, **kwargs)([x_val])[0] - elif (k == KC) & (cntk_two_dynamicity): - z = cntk_func_two_tensor(function_name, x_shape, - y=y_shape, **kwargs)([x_val, y_val])[0] - elif (k == KTH) & (function_name[:4] == 'conv'): - z = k.eval(getattr(k, function_name)( - k.variable(x_val), k.variable(convert_kernel(y_val)), **kwargs)) - elif concat_args: - z = k.eval(getattr(k, function_name)( - [k.variable(x_val), k.variable(y_val)], **kwargs)) - else: - z = k.eval(getattr(k, function_name)( - k.variable(x_val), k.variable(y_val), **kwargs)) + if (k == KC) & (cntk_dynamicity): + t, f = cntk_func_tensors(function_name, [x_shape, y_val], **kwargs) + z = f([x_val])[0] + elif (k == KC) & (cntk_two_dynamicity): + t, f = cntk_func_tensors(function_name, [x_shape, y_shape], **kwargs) + z = f([x_val, y_val])[0] + elif (k == KTH) & (function_name[:4] == 'conv'): + t = getattr(k, function_name)( + k.variable(x_val), k.variable(convert_kernel(y_val)), **kwargs) + z = k.eval(t) + elif concat_args: + t = getattr(k, function_name)( + [k.variable(x_val), k.variable(y_val)], **kwargs) + z = k.eval(t) else: - z = k.eval(getattr(k, function_name)( - x_shape_or_val, y_shape_or_val, **kwargs)) + t = getattr(k, function_name)( + k.variable(x_val), k.variable(y_val), **kwargs) + z = k.eval(t) + t_list += [t] z_list += [z] - if return_results: - if len(z_list) > 1: - return z_list - else: - return z_list[0] - assert_list_pairwise(z_list) - assert_list_keras_shape(z_list) + assert_list_keras_shape(t_list, z_list) -@keras_test def check_composed_tensor_operations(first_function_name, first_function_args, second_function_name, second_function_args, input_shape, backend_list): @@ -206,29 +178,38 @@ def check_composed_tensor_operations(first_function_name, first_function_args, class TestBackend(object): def test_is_keras_tensor(self): - for k in BACKENDS: - np_var = np.array([1, 2]) 
- with pytest.raises(ValueError): - k.is_keras_tensor(np_var) + np_var = np.array([1, 2]) + with pytest.raises(ValueError): + K.is_keras_tensor(np_var) - keras_var = k.variable(np_var) - assert k.is_keras_tensor(keras_var) is False - keras_placeholder = k.placeholder(shape=(2, 4, 5)) - assert k.is_keras_tensor(keras_placeholder) is False + keras_var = K.variable(np_var) + assert K.is_keras_tensor(keras_var) is False + keras_placeholder = K.placeholder(shape=(2, 4, 5)) + assert K.is_keras_tensor(keras_placeholder) is False def test_set_learning_phase(self): # not supported learning_phase - for k in BACKENDS: - with pytest.raises(ValueError): - k.set_learning_phase(2) + with pytest.raises(ValueError): + K.set_learning_phase(2) def test_eye(self): - z_list = [k.eval(k.eye(3)) for k in BACKENDS] - assert_list_pairwise(z_list) + check_single_tensor_operation('eye', 3, WITH_NP, shape_or_val=False) + + def test_ones(self): + check_single_tensor_operation('ones', (3, 5, 10, 8), WITH_NP, shape_or_val=False) + + def test_zeros(self): + check_single_tensor_operation('zeros', (3, 5, 10, 8), WITH_NP, shape_or_val=False) + + def test_ones_like(self): + check_single_tensor_operation('ones_like', (3, 5, 10, 8), WITH_NP, shape_or_val=True) + + def test_zeros_like(self): + check_single_tensor_operation('zeros_like', (3, 5, 10, 8), WITH_NP, shape_or_val=True) def test_linear_operations(self): - check_two_tensor_operation('dot', (4, 2), (2, 4), BACKENDS) - check_two_tensor_operation('dot', (4, 2), (5, 2, 3), BACKENDS) + check_two_tensor_operation('dot', (4, 2), (2, 4), WITH_NP) + check_two_tensor_operation('dot', (4, 2), (5, 2, 3), WITH_NP) # Theano has issues with batch_dot. Ignore THEANO backend for these tests. # https://github.com/Theano/Theano/issues/6518 @@ -247,15 +228,16 @@ def test_linear_operations(self): check_two_tensor_operation('batch_dot', (32, 20), (32, 20), BACKENDS_WITHOUT_THEANO, cntk_two_dynamicity=True, axes=(1, 1)) - check_single_tensor_operation('transpose', (4, 2), BACKENDS) - check_single_tensor_operation('reverse', (4, 3, 2), BACKENDS, axes=1) - check_single_tensor_operation('reverse', (4, 3, 2), BACKENDS_WITHOUT_CNTK, axes=(1, 2)) + check_single_tensor_operation('transpose', (4, 2), WITH_NP) + check_single_tensor_operation('reverse', (4, 3, 2), WITH_NP, axes=1) + if K.backend() != 'cntk': + check_single_tensor_operation('reverse', (4, 3, 2), WITH_NP, axes=(1, 2)) def test_random_variables(self): - check_single_tensor_operation('random_uniform_variable', (2, 3), BACKENDS, + check_single_tensor_operation('random_uniform_variable', (2, 3), WITH_NP, low=0., high=1., shape_or_val=False, assert_value_equality=False) - check_single_tensor_operation('random_normal_variable', (2, 3), BACKENDS, + check_single_tensor_operation('random_normal_variable', (2, 3), WITH_NP, mean=0., scale=1., shape_or_val=False, assert_value_equality=False) @@ -276,37 +258,39 @@ def test_batch_dot_shape(self): assert_allclose(K.eval(xy_batch_dot), np.ones((32, 1)) * 20, atol=1e-05) def test_shape_operations(self): - check_two_tensor_operation('concatenate', (4, 3), (4, 2), BACKENDS, + check_two_tensor_operation('concatenate', (4, 3), (4, 2), WITH_NP, axis=-1, concat_args=True) - check_single_tensor_operation('reshape', (4, 2), BACKENDS, shape=(8, 1)) - check_single_tensor_operation('permute_dimensions', (4, 2, 3), BACKENDS, + check_single_tensor_operation('reshape', (4, 2), WITH_NP, shape=(8, 1)) + check_single_tensor_operation('permute_dimensions', (4, 2, 3), WITH_NP, pattern=(2, 0, 1)) - 
check_single_tensor_operation('repeat', (4, 1), BACKENDS, n=3) - check_single_tensor_operation('flatten', (4, 1), BACKENDS) - check_single_tensor_operation('batch_flatten', (20, 2, 5), BACKENDS, + check_single_tensor_operation('repeat', (4, 1), WITH_NP, n=3) + check_single_tensor_operation('flatten', (4, 1), WITH_NP) + check_single_tensor_operation('batch_flatten', (20, 2, 5), WITH_NP, cntk_dynamicity=True) - check_single_tensor_operation('expand_dims', (4, 3), BACKENDS, axis=-1) - check_single_tensor_operation('expand_dims', (4, 3, 2), BACKENDS, axis=1) - check_single_tensor_operation('squeeze', (4, 3, 1), BACKENDS, axis=2) - check_single_tensor_operation('squeeze', (4, 1, 1), BACKENDS, axis=1) + check_single_tensor_operation('expand_dims', (4, 3), WITH_NP, axis=-1) + check_single_tensor_operation('expand_dims', (4, 3, 2), WITH_NP, axis=1) + check_single_tensor_operation('squeeze', (4, 3, 1), WITH_NP, axis=2) + check_single_tensor_operation('squeeze', (4, 1, 1), WITH_NP, axis=1) check_composed_tensor_operations('reshape', {'shape': (4, 3, 1, 1)}, 'squeeze', {'axis': 2}, - (4, 3, 1, 1), BACKENDS) + (4, 3, 1, 1), WITH_NP) + @pytest.mark.skipif(K.backend() != 'theano', + reason='We only test the shape inference of the ' + 'theano backend.') def test_none_shape_operations(self): # Test shape inference when input # shape has `None` entries - if K.backend() == 'theano': - x = KTH.placeholder((3, None, 4)) + x = K.placeholder((3, None, 4)) - y = KTH.batch_flatten(x) - if hasattr(y, '_keras_shape'): - assert y._keras_shape == (3, None) + y = K.batch_flatten(x) + if hasattr(y, '_keras_shape'): + assert y._keras_shape == (3, None) - y = KTH.flatten(x) - if hasattr(y, '_keras_shape'): - assert y._keras_shape == (None, ) + y = K.flatten(x) + if hasattr(y, '_keras_shape'): + assert y._keras_shape == (None, ) def test_repeat_elements(self): reps = 3 @@ -315,10 +299,8 @@ def test_repeat_elements(self): arr = np.arange(np.prod(shape)).reshape(shape) for rep_axis in range(ndims): - np_rep = np.repeat(arr, reps, axis=rep_axis) - check_single_tensor_operation('repeat_elements', arr, BACKENDS, - rep=reps, axis=rep_axis, - assert_value_with_ref=np_rep) + check_single_tensor_operation('repeat_elements', arr, WITH_NP, + rep=reps, axis=rep_axis) if K.backend() != 'cntk' and K.backend() != 'mxnet': shape = list(shape) @@ -331,8 +313,8 @@ def test_repeat_elements(self): def test_tile(self): shape = (3, 4) arr = np.arange(np.prod(shape)).reshape(shape) - check_single_tensor_operation('tile', arr, BACKENDS, n=[2, 1]) - check_single_tensor_operation('tile', (2, 5), BACKENDS, n=[5, 2]) + check_single_tensor_operation('tile', arr, WITH_NP, n=[2, 1]) + check_single_tensor_operation('tile', (2, 5), WITH_NP, n=[5, 2]) # test theano shape inference when # input shape has None entries @@ -349,11 +331,13 @@ def test_gather(self): shape = (10, 2, 3) ref = np.arange(np.prod(shape)).reshape(shape) inds = [1, 3, 7, 9] + t_list = [k.gather(k.variable(ref), k.variable(inds, dtype='int32')) + for k in BACKENDS] z_list = [k.eval(k.gather(k.variable(ref), k.variable(inds, dtype='int32'))) for k in BACKENDS] assert_list_pairwise(z_list) - assert_list_keras_shape(z_list) + assert_list_keras_shape(t_list, z_list) # test theano shape inference when # input shape has None entries @@ -383,68 +367,93 @@ def test_value_manipulation(self): check_single_tensor_operation('print_tensor', (1, 2, 3), BACKENDS) def test_elementwise_operations(self): - check_single_tensor_operation('max', (4, 2), BACKENDS) - check_single_tensor_operation('max', (4, 
2), BACKENDS, axis=1, keepdims=True) - - check_single_tensor_operation('min', (4, 2), BACKENDS) - check_single_tensor_operation('min', (4, 2), BACKENDS, axis=1, keepdims=True) - check_single_tensor_operation('min', (4, 2, 3), BACKENDS, axis=[1, -1]) - - check_single_tensor_operation('mean', (4, 2), BACKENDS) - check_single_tensor_operation('mean', (4, 2), BACKENDS, axis=1, keepdims=True) - check_single_tensor_operation('mean', (4, 2, 3), BACKENDS, axis=-1, keepdims=True) - check_single_tensor_operation('mean', (4, 2, 3), BACKENDS, axis=[1, -1]) - - check_single_tensor_operation('std', (4, 2), BACKENDS) - check_single_tensor_operation('std', (4, 2), BACKENDS, axis=1, keepdims=True) - check_single_tensor_operation('std', (4, 2, 3), BACKENDS, axis=[1, -1]) - - check_single_tensor_operation('prod', (4, 2), BACKENDS) - check_single_tensor_operation('prod', (4, 2), BACKENDS, axis=1, keepdims=True) - check_single_tensor_operation('prod', (4, 2, 3), BACKENDS, axis=[1, -1]) - - # cntk does not support cumsum and cumprod yet - check_single_tensor_operation('cumsum', (4, 2), [KTF, KTH]) - check_single_tensor_operation('cumsum', (4, 2), [KTF, KTH], axis=1) - - check_single_tensor_operation('cumprod', (4, 2), [KTF, KTH]) - check_single_tensor_operation('cumprod', (4, 2), [KTF, KTH], axis=1) - - check_single_tensor_operation('any', (4, 2), BACKENDS) - check_single_tensor_operation('any', (4, 2), BACKENDS, axis=1, keepdims=True) - - check_single_tensor_operation('all', (4, 2), BACKENDS) - check_single_tensor_operation('all', (4, 2), BACKENDS, axis=1, keepdims=True) - - check_single_tensor_operation('argmax', (4, 2), BACKENDS) - check_single_tensor_operation('argmax', (4, 2), BACKENDS, axis=1) - - check_single_tensor_operation('argmin', (4, 2), BACKENDS) - check_single_tensor_operation('argmin', (4, 2), BACKENDS, axis=1) - - check_single_tensor_operation('square', (4, 2), BACKENDS) - check_single_tensor_operation('abs', (4, 2), BACKENDS) - check_single_tensor_operation('sqrt', (4, 2), BACKENDS) - check_single_tensor_operation('exp', (4, 2), BACKENDS) - # cntk return -85.1 for zero or negative number, not nan, so can't compare with other backend. 
- check_single_tensor_operation('log', (4, 2), [KTH, KTF]) - check_single_tensor_operation('round', (4, 2), BACKENDS) - check_single_tensor_operation('sign', (4, 2), BACKENDS) - check_single_tensor_operation('pow', (4, 2), BACKENDS, a=3) - check_single_tensor_operation('clip', (4, 2), BACKENDS, min_value=0.4, + check_single_tensor_operation('max', (4, 2), WITH_NP) + check_single_tensor_operation('max', (4, 2), WITH_NP, axis=1, keepdims=True) + check_single_tensor_operation('max', (4, 2, 3), WITH_NP, axis=[1, -1]) + + check_single_tensor_operation('min', (4, 2), WITH_NP) + check_single_tensor_operation('min', (4, 2), WITH_NP, axis=1, keepdims=True) + check_single_tensor_operation('min', (4, 2, 3), WITH_NP, axis=[1, -1]) + + check_single_tensor_operation('mean', (4, 2), WITH_NP) + check_single_tensor_operation('mean', (4, 2), WITH_NP, axis=1, keepdims=True) + check_single_tensor_operation('mean', (4, 2, 3), WITH_NP, axis=-1, keepdims=True) + check_single_tensor_operation('mean', (4, 2, 3), WITH_NP, axis=[1, -1]) + + check_single_tensor_operation('var', (4, 2), WITH_NP) + check_single_tensor_operation('var', (4, 2), WITH_NP, axis=1, keepdims=True) + check_single_tensor_operation('var', (4, 2, 3), WITH_NP, axis=[1, -1]) + + check_single_tensor_operation('std', (4, 2), WITH_NP) + check_single_tensor_operation('std', (4, 2), WITH_NP, axis=1, keepdims=True) + check_single_tensor_operation('std', (4, 2, 3), WITH_NP, axis=[1, -1]) + # check_single_tensor_operation('std', (4, 2, 3), BACKENDS, axis=[1, -1]) + + # MXNet backend does not support logsumexp yet + check_single_tensor_operation('logsumexp', (4, 2), BACKENDS_WITHOUT_MXNET) + check_single_tensor_operation('logsumexp', (4, 2), BACKENDS_WITHOUT_MXNET, axis=1, keepdims=True) + check_single_tensor_operation('logsumexp', (4, 2, 3), BACKENDS_WITHOUT_MXNET, axis=[1, -1]) + + check_single_tensor_operation('prod', (4, 2), WITH_NP) + check_single_tensor_operation('prod', (4, 2), WITH_NP, axis=1, keepdims=True) + check_single_tensor_operation('prod', (4, 2, 3), WITH_NP, axis=[1, -1]) + + check_single_tensor_operation('any', (4, 2), WITH_NP) + check_single_tensor_operation('any', (4, 2), WITH_NP, axis=1, keepdims=True) + check_single_tensor_operation('any', (4, 2, 3), WITH_NP, axis=[1, -1]) + + check_single_tensor_operation('all', (4, 2), WITH_NP) + check_single_tensor_operation('all', (4, 2), WITH_NP, axis=1, keepdims=True) + check_single_tensor_operation('all', (4, 2, 3), WITH_NP, axis=[1, -1]) + + check_single_tensor_operation('argmax', (4, 2), WITH_NP) + check_single_tensor_operation('argmax', (4, 2), WITH_NP, axis=1) + + check_single_tensor_operation('argmin', (4, 2), WITH_NP) + check_single_tensor_operation('argmin', (4, 2), WITH_NP, axis=1) + + check_single_tensor_operation('square', (4, 2), WITH_NP) + check_single_tensor_operation('abs', (4, 2), WITH_NP) + check_single_tensor_operation('sqrt', (4, 2), WITH_NP) + check_single_tensor_operation('exp', (4, 2), WITH_NP) + + check_single_tensor_operation('round', (4, 2), WITH_NP) + check_single_tensor_operation('sign', (4, 2), WITH_NP) + check_single_tensor_operation('pow', (4, 2), WITH_NP, a=3) + check_single_tensor_operation('clip', (4, 2), WITH_NP, min_value=0.4, max_value=0.6) + check_single_tensor_operation('cos', (4, 2), WITH_NP) + check_single_tensor_operation('sin', (4, 2), WITH_NP) + # two-tensor ops - check_two_tensor_operation('equal', (4, 2), (4, 2), BACKENDS) - check_two_tensor_operation('not_equal', (4, 2), (4, 2), BACKENDS) - check_two_tensor_operation('greater', (4, 2), (4, 2), 
BACKENDS) - check_two_tensor_operation('greater_equal', (4, 2), (4, 2), BACKENDS) - check_two_tensor_operation('less', (4, 2), (4, 2), BACKENDS) - check_two_tensor_operation('less_equal', (4, 2), (4, 2), BACKENDS) - check_two_tensor_operation('maximum', (4, 2), (4, 2), BACKENDS) - check_two_tensor_operation('minimum', (4, 2), (4, 2), BACKENDS) - - # cntk doesn't support gradient in this way + check_two_tensor_operation('equal', (4, 2), (4, 2), WITH_NP) + check_two_tensor_operation('not_equal', (4, 2), (4, 2), WITH_NP) + check_two_tensor_operation('greater', (4, 2), (4, 2), WITH_NP) + check_two_tensor_operation('greater_equal', (4, 2), (4, 2), WITH_NP) + check_two_tensor_operation('less', (4, 2), (4, 2), WITH_NP) + check_two_tensor_operation('less_equal', (4, 2), (4, 2), WITH_NP) + check_two_tensor_operation('maximum', (4, 2), (4, 2), WITH_NP) + check_two_tensor_operation('minimum', (4, 2), (4, 2), WITH_NP) + + @pytest.mark.skipif(K.backend() == 'cntk' or K.backend() == 'mxnet', + reason='cntk does not support cumsum and cumprod yet') + def test_cumsum_cumprod(self): + check_single_tensor_operation('cumsum', (4, 2), WITH_NP) + check_single_tensor_operation('cumsum', (4, 2), WITH_NP, axis=1) + + check_single_tensor_operation('cumprod', (4, 2), WITH_NP) + check_single_tensor_operation('cumprod', (4, 2), WITH_NP, axis=1) + + @pytest.mark.skipif(K.backend() == 'cntk', + reason='cntk return -85.1 for zero or ' + 'negative number, not nan, so can\'t ' + 'compare with other backend.') + def test_log(self): + check_single_tensor_operation('log', (4, 2), WITH_NP) + + @pytest.mark.skipif(K.backend() == 'cntk', + reason='cntk doesn\'t support gradient in this way.') def test_gradient(self): val = np.random.random((4, 2)) x_list = [k.variable(val) for k in [KTH, KTF]] @@ -469,13 +478,14 @@ def test_stop_gradient(self): # It doesn't check the functionality (which is checked at the # test_gradient test). val = np.random.random((4, 2)) - for k in BACKENDS: - a = k.variable(val) - b = k.square(a) - c, d = k.stop_gradient([a, b]) - e = k.stop_gradient(b) - - # cntk currently not support function in this way, so can't test as this + a = K.variable(val) + b = K.square(a) + c, d = K.stop_gradient([a, b]) + e = K.stop_gradient(b) + + @pytest.mark.skipif(K.backend() == 'cntk', + reason='cntk currently not support function in this ' + 'way, so can\'t test as this.') def test_function(self): test_backend = [KTH, KTF] val = np.random.random((4, 2)) @@ -498,6 +508,8 @@ def test_function(self): new_val_list = [k.get_value(x) for x, k in zip(x_list, test_backend)] assert_list_pairwise(new_val_list) + @pytest.mark.skipif(K.backend() != 'tensorflow', + reason='Uses the `fetches` argument.') def test_function_tf_fetches(self): # Additional operations can be passed to tf.Session().run() via its # `fetches` arguments. In contrast to `updates` argument of @@ -505,19 +517,21 @@ def test_function_tf_fetches(self): # they can run in parallel. Also they should not contribute to output of # KTF.function(). - x = KTF.variable(0.) - y = KTF.variable(0.) - x_placeholder = KTF.placeholder(shape=()) - y_placeholder = KTF.placeholder(shape=()) + x = K.variable(0.) + y = K.variable(0.) 
+ x_placeholder = K.placeholder(shape=()) + y_placeholder = K.placeholder(shape=()) - f = KTF.function(inputs=[x_placeholder, y_placeholder], - outputs=[x_placeholder + y_placeholder], - updates=[(x, x_placeholder + 1.)], - fetches=[KTF.update(y, 5.)]) + f = K.function(inputs=[x_placeholder, y_placeholder], + outputs=[x_placeholder + y_placeholder], + updates=[(x, x_placeholder + 1.)], + fetches=[K.update(y, 5.)]) output = f([10., 20.]) assert output == [30.] - assert KTF.get_session().run(fetches=[x, y]) == [11., 5.] + assert K.get_session().run(fetches=[x, y]) == [11., 5.] + @pytest.mark.skipif(K.backend() != 'tensorflow', + reason='Uses the `feed_dict` argument.') def test_function_tf_feed_dict(self): # Additional substitutions can be passed to `tf.Session().run()` via its # `feed_dict` arguments. Note that the feed_dict is passed once in the @@ -525,35 +539,62 @@ def test_function_tf_feed_dict(self): # this feed_dict we can provide additional substitutions besides Keras # inputs. - x = KTF.variable(0.) - y = KTF.variable(0.) - x_placeholder = KTF.placeholder(shape=()) - y_placeholder = KTF.placeholder(shape=()) + x = K.variable(0.) + y = K.variable(0.) + x_placeholder = K.placeholder(shape=()) + y_placeholder = K.placeholder(shape=()) feed_dict = {y_placeholder: 3.} - f = KTF.function(inputs=[x_placeholder], - outputs=[x_placeholder + 1.], - updates=[(x, x_placeholder + 10.)], - feed_dict=feed_dict, - fetches=[KTF.update(y, y_placeholder * 10.)]) + f = K.function(inputs=[x_placeholder], + outputs=[x_placeholder + 1.], + updates=[(x, x_placeholder + 10.)], + feed_dict=feed_dict, + fetches=[K.update(y, y_placeholder * 10.)]) output = f([10.]) assert output == [11.] - assert KTF.get_session().run(fetches=[x, y]) == [20., 30.] + assert K.get_session().run(fetches=[x, y]) == [20., 30.] # updated value in feed_dict will be modified within the K.function() feed_dict[y_placeholder] = 4. output = f([20.]) assert output == [21.] - assert KTF.get_session().run(fetches=[x, y]) == [30., 40.] + assert K.get_session().run(fetches=[x, y]) == [30., 40.] + + @pytest.mark.skipif(K.backend() != 'tensorflow', + reason='Uses the `options` and `run_metadata` arguments.') + def test_function_tf_run_options_with_run_metadata(self): + from tensorflow.core.protobuf import config_pb2 + x_placeholder = K.placeholder(shape=()) + y_placeholder = K.placeholder(shape=()) + + run_options = config_pb2.RunOptions(output_partition_graphs=True) + run_metadata = config_pb2.RunMetadata() + # enable run_options. + f = K.function(inputs=[x_placeholder, y_placeholder], + outputs=[x_placeholder + y_placeholder], + options=run_options, + run_metadata=run_metadata) + output = f([10., 20.]) + assert output == [30.] + assert len(run_metadata.partition_graphs) > 0 + # disable run_options. + f = K.function(inputs=[x_placeholder, y_placeholder], + outputs=[x_placeholder + y_placeholder], + run_metadata=run_metadata) + output = f([10., 20.]) + assert output == [30.] + assert len(run_metadata.partition_graphs) == 0 + @pytest.mark.skipif(K.backend() != 'tensorflow', + reason='Uses the `string` type for a tensor.') def test_function_tf_string_input(self): # Test functions with string inputs. 
- x_placeholder = KTF.placeholder(shape=(), dtype="string") - x_identity = KTF.identity(x_placeholder) + x_placeholder = K.placeholder(shape=(), dtype="string") + x_identity = K.identity(x_placeholder) - f = KTF.function(inputs=[x_placeholder], outputs=[x_identity]) + f = K.function(inputs=[x_placeholder], outputs=[x_identity]) output = f([b'test']) assert output == [b'test'] @@ -596,7 +637,7 @@ def rnn_fn(x_k, h_k): ] for (i, kwargs) in enumerate(kwargs_list): - last_y1, y1, h1 = reference_operations.rnn(x, [wi, wh, None], h0, **kwargs) + last_y1, y1, h1 = KNP.rnn(x, [wi, wh, None], h0, **kwargs) last_y2, y2, h2 = K.rnn(rnn_fn, x_k, h0_k, **kwargs) assert len(h2) == 1 @@ -666,7 +707,7 @@ def rnn_fn(x_k, h_k): ] for (i, kwargs) in enumerate(kwargs_list): - last_y1, y1, h1 = reference_operations.rnn(x, [wi, wh, None], h0, **kwargs) + last_y1, y1, h1 = KNP.rnn(x, [wi, wh, None], h0, **kwargs) last_y2, y2, h2 = K.rnn(rnn_fn, x_k, h0_k, **kwargs) assert len(h2) == 2 @@ -719,8 +760,8 @@ def rnn_fn(x_k, h_k): y_k = K.dot(x_k, wi_k) return y_k, [] - last_y1, y1, h1 = reference_operations.rnn(x, [wi, None, None], None, - go_backwards=False, mask=None) + last_y1, y1, h1 = KNP.rnn(x, [wi, None, None], None, + go_backwards=False, mask=None) last_y2, y2, h2 = K.rnn(rnn_fn, x_k, [], go_backwards=False, mask=None) @@ -907,7 +948,7 @@ def test_switch(self): # scalar val = np.random.random() z_list = [] - for k in BACKENDS: + for k in WITH_NP: x = k.variable(val) x = k.switch(k.greater_equal(x, 0.5), x * 0.1, x * 0.2) z_list.append(k.eval(x)) @@ -920,7 +961,7 @@ def test_switch(self): for s in shapes: z_list = [] arrays = list(map(np.random.random, s)) - for k in BACKENDS: + for k in WITH_NP: x, then_expr, else_expr = map(k.variable, arrays) cond = k.greater_equal(x, 0.5) z_list.append(k.eval(k.switch(cond, then_expr, else_expr))) @@ -944,37 +985,56 @@ def test_dropout(self): assert np.abs(z_list[i].mean() - z_list[i + 1].mean()) < 0.05 # Test invalid use cases - for k in BACKENDS: - with pytest.raises(ValueError): - z = k.dropout(k.variable(val), level=-0.5) + with pytest.raises(ValueError): + z = K.dropout(K.variable(val), level=-0.5) + + @pytest.mark.parametrize('alpha,max_value,threshold', [ + (0.0, None, 0.0), # standard relu + (0.1, None, 0.0), # set alpha only + (0.0, 5.0, 0.0), # set max_value only + (0.0, None, 0.8), # set threshold only + (0.1, 5.0, 0.0), # set alpha and max_value + (0.1, None, 0.8), # set alpha and threshold + (0.0, 5.0, 0.8), # set max_value and threshold + (0.1, 5.0, 0.8), # set all + (0.1, 0.0, 0.8), # max_value is zero + (0.1, 5.0, -2.8), # threshold is negative + (0.1, 9.0, 0.8), # max_value > 6 + ]) + def test_relu(self, alpha, max_value, threshold): + check_single_tensor_operation('relu', (4, 2), WITH_NP, alpha=alpha, + max_value=max_value, threshold=threshold) def test_nn_operations(self): - check_single_tensor_operation('relu', (4, 2), BACKENDS, alpha=0.1, max_value=0.5) - check_single_tensor_operation('softplus', (4, 10), BACKENDS) - check_single_tensor_operation('elu', (4, 10), BACKENDS, alpha=0.5) + check_single_tensor_operation('softplus', (4, 10), WITH_NP) + check_single_tensor_operation('elu', (4, 10), WITH_NP, alpha=0.5) + + check_single_tensor_operation('sigmoid', (4, 2), WITH_NP) + check_single_tensor_operation('hard_sigmoid', (4, 2), WITH_NP) + check_single_tensor_operation('tanh', (4, 2), WITH_NP) - check_single_tensor_operation('sigmoid', (4, 2), BACKENDS) - check_single_tensor_operation('hard_sigmoid', (4, 2), BACKENDS) - 
check_single_tensor_operation('tanh', (4, 2), BACKENDS) + check_single_tensor_operation('softmax', (4, 10), WITH_NP) + check_single_tensor_operation('softmax', (4, 5, 3), WITH_NP, axis=1) + check_single_tensor_operation('softmax', (4, 5, 3, 10), WITH_NP, axis=2) - # MXNet backend has issues with softmax_crossentropy - check_two_tensor_operation('binary_crossentropy', (4, 2), (4, 2), BACKENDS_WITHOUT_MXNET, from_logits=True) + check_two_tensor_operation('binary_crossentropy', (4, 2), (4, 2), WITH_NP, from_logits=True) # cross_entropy call require the label is a valid probability distribution, # otherwise it is garbage in garbage out... # due to the algo difference, we can't guarantee CNTK has the same result on the garbage input. # so create a separate test case for valid label input - check_two_tensor_operation('categorical_crossentropy', (4, 2), (4, 2), BACKENDS, from_logits=True) + if K.backend() != 'cntk': + check_two_tensor_operation('categorical_crossentropy', (4, 2), (4, 2), WITH_NP, from_logits=True) xval = np.asarray([[0.26157712, 0.0432167], [-0.43380741, 0.30559841], [0.20225059, -0.38956559], [-0.13805378, 0.08506755]], dtype=np.float32) yval = np.asarray([[0.46221867, 0.53778133], [0.51228984, 0.48771016], [0.64916514, 0.35083486], [0.47028078, 0.52971922]], dtype=np.float32) check_two_tensor_operation('categorical_crossentropy', yval, xval, - BACKENDS, cntk_two_dynamicity=True, from_logits=True) - check_two_tensor_operation('binary_crossentropy', (4, 2), (4, 2), BACKENDS, from_logits=False) - check_two_tensor_operation('categorical_crossentropy', (4, 2), (4, 2), BACKENDS, from_logits=False) + WITH_NP, cntk_two_dynamicity=True, from_logits=True) + check_two_tensor_operation('binary_crossentropy', (4, 2), (4, 2), WITH_NP, from_logits=False) + check_two_tensor_operation('categorical_crossentropy', (4, 2), (4, 2), WITH_NP, from_logits=False) - check_single_tensor_operation('l2_normalize', (4, 3), BACKENDS, axis=-1) - check_single_tensor_operation('l2_normalize', (4, 3), BACKENDS, axis=1) + check_single_tensor_operation('l2_normalize', (4, 3), WITH_NP, axis=-1) + check_single_tensor_operation('l2_normalize', (4, 3), WITH_NP, axis=1) def test_in_top_k(self): batch_size = 20 @@ -1008,26 +1068,69 @@ def test_in_top_k(self): @pytest.mark.parametrize('op,input_shape,kernel_shape,padding,data_format', [ ('conv1d', (2, 8, 2), (3, 2, 3), 'same', 'channels_last'), ('conv1d', (1, 8, 2), (3, 2, 3), 'valid', 'channels_last'), + ('conv1d', (1, 2, 8), (3, 2, 3), 'valid', 'channels_first'), ('conv2d', (2, 3, 4, 5), (3, 3, 3, 2), 'same', 'channels_first'), ('conv2d', (2, 3, 5, 6), (4, 3, 3, 4), 'valid', 'channels_first'), ('conv2d', (1, 6, 5, 3), (3, 4, 3, 2), 'valid', 'channels_last'), ('conv2d', (1, 7, 6, 3), (3, 3, 3, 4), 'same', 'channels_last'), ('conv3d', (2, 3, 4, 5, 4), (3, 3, 3, 3, 4), 'same', 'channels_first'), - ('conv3d', (2, 3, 5, 4, 6), (3, 2, 4, 3, 4), 'valid', 'channels_first'), ('conv3d', (1, 2, 2, 2, 1), (2, 2, 2, 1, 1), 'valid', 'channels_last'), ('conv3d', (1, 3, 5, 4, 2), (3, 3, 3, 2, 3), 'same', 'channels_last'), ]) def test_conv(self, op, input_shape, kernel_shape, padding, data_format): - k = K.backend() - _, x = parse_shape_or_val(input_shape) - _, w = parse_shape_or_val(kernel_shape) - y1 = reference_operations.conv(x, w, padding, data_format) - y2 = check_two_tensor_operation( - op, x, w, [KTH if k == 'theano' else KC if k == 'cntk' else KTF], + check_two_tensor_operation( + op, input_shape, kernel_shape, [KMX], padding=padding, data_format=data_format, - 
cntk_dynamicity=True, return_results=True) - assert_allclose(y1, y2, atol=1e-05) - + cntk_dynamicity=True) + + @pytest.mark.parametrize( + 'op,input_shape,kernel_shape,output_shape,padding,data_format', [ + ('conv2d_transpose', (2, 5, 6, 3), (3, 3, 2, 3), (2, 5, 6, 2), + 'same', 'channels_last'), + ('conv2d_transpose', (2, 3, 8, 9), (3, 3, 2, 3), (2, 2, 8, 9), + 'same', 'channels_first'), + ]) + def test_conv_transpose(self, op, input_shape, kernel_shape, output_shape, + padding, data_format): + check_two_tensor_operation( + op, input_shape, kernel_shape, BACKENDS_WITHOUT_MXNET, + output_shape=output_shape, padding=padding, data_format=data_format, + cntk_dynamicity=True) + + @pytest.mark.skipif((K.backend() == 'cntk' and K.dev.type() == 0), + reason='cntk only supports dilated conv on GPU') + @pytest.mark.parametrize('op,input_shape,kernel_shape,padding,data_format,dilation_rate', [ + ('conv1d', (2, 8, 3), (4, 3, 2), 'valid', 'channels_last', 2), + ('conv1d', (2, 3, 8), (4, 3, 2), 'valid', 'channels_first', 2), + ('conv2d', (2, 8, 9, 3), (3, 3, 3, 2), 'same', 'channels_last', (2, 2)), + ('conv2d', (2, 3, 9, 8), (4, 3, 3, 4), 'valid', 'channels_first', (2, 2)), + ('conv3d', (2, 5, 4, 6, 3), (2, 2, 3, 3, 4), 'valid', 'channels_last', (2, 2, 2)), + ('conv3d', (2, 3, 5, 4, 6), (2, 2, 3, 3, 4), 'same', 'channels_first', (2, 2, 2)), + ]) + def test_dilated_conv(self, op, input_shape, kernel_shape, padding, + data_format, dilation_rate): + check_two_tensor_operation( + op, input_shape, kernel_shape, BACKENDS_WITHOUT_MXNET, + padding=padding, data_format=data_format, + dilation_rate=dilation_rate, cntk_dynamicity=True) + + @pytest.mark.skipif((K.backend() == 'cntk' and K.dev.type() == 0), + reason='cntk only supports dilated conv transpose on GPU') + @pytest.mark.parametrize( + 'op,input_shape,kernel_shape,output_shape,padding,data_format,dilation_rate', [ + ('conv2d_transpose', (2, 5, 6, 3), (3, 3, 2, 3), (2, 5, 6, 2), + 'same', 'channels_last', (2, 2)), + ('conv2d_transpose', (2, 3, 8, 9), (3, 3, 2, 3), (2, 2, 8, 9), + 'same', 'channels_first', (2, 2)), + ]) + def test_dilated_conv_transpose(self, op, input_shape, kernel_shape, output_shape, + padding, data_format, dilation_rate): + check_two_tensor_operation( + op, input_shape, kernel_shape, BACKENDS_WITHOUT_MXNET, output_shape=output_shape, + padding=padding, data_format=data_format, dilation_rate=dilation_rate, + cntk_dynamicity=True) + + @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet Backend: does not support depth multiplier > 1.') @pytest.mark.parametrize('op,input_shape,kernel_shape,padding,data_format', [ ('depthwise_conv2d', (2, 3, 4, 5), (3, 3, 3, 2), 'same', 'channels_first'), ('depthwise_conv2d', (2, 3, 5, 6), (4, 3, 3, 4), 'valid', 'channels_first'), @@ -1035,36 +1138,25 @@ def test_conv(self, op, input_shape, kernel_shape, padding, data_format): ('depthwise_conv2d', (1, 7, 6, 3), (3, 3, 3, 4), 'same', 'channels_last'), ]) def test_depthwise_conv(self, op, input_shape, kernel_shape, padding, data_format): - k = K.backend() - _, x = parse_shape_or_val(input_shape) - _, w = parse_shape_or_val(kernel_shape) - y1 = reference_operations.depthwise_conv(x, w, padding, data_format) - y2 = check_two_tensor_operation( - op, x, w, [KTH if k == 'theano' else KC if k == 'cntk' else KTF], + check_two_tensor_operation( + op, input_shape, kernel_shape, WITH_NP, padding=padding, data_format=data_format, - cntk_dynamicity=True, return_results=True) - assert_allclose(y1, y2, atol=1e-05) + cntk_dynamicity=True) 
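The refactored conv, depthwise-conv, and pool tests above lean on the `WITH_NP` list, so each op is evaluated both on the active Keras backend and on the NumPy reference backend (`reference_operations`, imported as `KNP`), and the results are compared. A minimal sketch of that comparison pattern for a single op, assuming `tests/keras/backend` is importable as it is when the test suite runs:

    import numpy as np
    from keras import backend as K
    import reference_operations as KNP  # NumPy reference backend used by these tests

    x_val = np.random.random((4, 2)).astype(K.floatx())
    # Evaluate the op on the active backend and on the NumPy reference, then compare.
    z_backend = K.eval(K.mean(K.variable(x_val), axis=1, keepdims=True))
    z_reference = KNP.mean(x_val, axis=1, keepdims=True)
    np.testing.assert_allclose(z_backend, z_reference, atol=1e-5)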
@pytest.mark.parametrize('op,input_shape,pool_size,strides,padding,data_format,pool_mode', [ - ('pool2d', (2, 3, 7, 7), (3, 3), (1, 1), 'same', 'channels_first', 'avg'), ('pool2d', (3, 3, 8, 5), (2, 3), (1, 1), 'valid', 'channels_first', 'max'), ('pool2d', (2, 9, 5, 3), (3, 2), (1, 1), 'valid', 'channels_last', 'avg'), ('pool2d', (3, 6, 7, 3), (3, 3), (1, 1), 'same', 'channels_last', 'max'), - ('pool3d', (2, 3, 7, 7, 7), (3, 3, 3), (1, 1, 1), 'same', 'channels_first', 'avg'), ('pool3d', (3, 3, 8, 5, 9), (2, 3, 2), (1, 1, 1), 'valid', 'channels_first', 'max'), ('pool3d', (2, 8, 9, 5, 3), (3, 2, 3), (1, 1, 1), 'valid', 'channels_last', 'avg'), ('pool3d', (3, 5, 6, 7, 3), (3, 3, 3), (1, 1, 1), 'same', 'channels_last', 'max'), ]) def test_pool(self, op, input_shape, pool_size, strides, padding, data_format, pool_mode): - k = K.backend() - _, x = parse_shape_or_val(input_shape) - y1 = reference_operations.pool(x, pool_size, strides, padding, data_format, pool_mode) - y2 = check_single_tensor_operation( - op, x, [KTH if k == 'theano' else KC if k == 'cntk' else KTF], + check_single_tensor_operation( + op, input_shape, WITH_NP, pool_size=pool_size, strides=strides, padding=padding, data_format=data_format, pool_mode=pool_mode, - cntk_dynamicity=True, return_results=True) - assert_allclose(y1, y2, atol=1e-05) + cntk_dynamicity=True) def legacy_test_conv1d(self): # channels_last input shape: (n, length, input_depth) @@ -1136,12 +1228,13 @@ def test_separable_conv(self, op, input_shape, kernel_shape, depth_multiplier, p _, x = parse_shape_or_val(input_shape) _, depthwise = parse_shape_or_val(kernel_shape + (input_depth, depth_multiplier)) _, pointwise = parse_shape_or_val((1,) * len(kernel_shape) + (input_depth * depth_multiplier, 7)) - y1 = reference_operations.separable_conv(x, depthwise, pointwise, padding, data_format) + y1 = KNP.separable_conv(x, depthwise, pointwise, + padding=padding, data_format=data_format) if K.backend() == 'cntk': - y2 = cntk_func_three_tensor( - op, input_shape, - depthwise, pointwise, - padding=padding, data_format=data_format)([x])[0] + _, cntk_func = cntk_func_tensors( + op, [input_shape, depthwise, pointwise], + padding=padding, data_format=data_format) + y2 = cntk_func([x])[0] else: y2 = K.eval(getattr(K, op)( K.variable(x), @@ -1197,49 +1290,46 @@ def legacy_test_pool3d(self): def test_random_normal(self): # test standard normal as well as a normal with a different set of parameters - for k in BACKENDS: - for mean, std in [(0., 1.), (-10., 5.)]: - rand = k.eval(k.random_normal((300, 200), mean=mean, stddev=std, seed=1337)) - assert rand.shape == (300, 200) - assert np.abs(np.mean(rand) - mean) < std * 0.015 - assert np.abs(np.std(rand) - std) < std * 0.015 - - # test that random_normal also generates different values when used within a function - r = k.random_normal((1,), mean=mean, stddev=std, seed=1337) - samples = [k.eval(r) for _ in range(60000)] - assert np.abs(np.mean(samples) - mean) < std * 0.015 - assert np.abs(np.std(samples) - std) < std * 0.015 + for mean, std in [(0., 1.), (-10., 5.)]: + rand = K.eval(K.random_normal((300, 200), mean=mean, stddev=std, seed=1337)) + assert rand.shape == (300, 200) + assert np.abs(np.mean(rand) - mean) < std * 0.015 + assert np.abs(np.std(rand) - std) < std * 0.015 + + # test that random_normal also generates different values when used within a function + r = K.random_normal((10, 10), mean=mean, stddev=std, seed=1337) + samples = np.array([K.eval(r) for _ in range(200)]) + assert np.abs(np.mean(samples) - mean) < std * 
0.015 + assert np.abs(np.std(samples) - std) < std * 0.015 def test_random_uniform(self): min_val = -1. max_val = 1. - for k in BACKENDS: - rand = k.eval(k.random_uniform((200, 100), min_val, max_val)) - assert rand.shape == (200, 100) - assert np.abs(np.mean(rand)) < 0.015 - assert max_val - 0.015 < np.max(rand) <= max_val - assert min_val + 0.015 > np.min(rand) >= min_val - - r = k.random_uniform((1,), minval=min_val, maxval=max_val) - samples = [k.eval(r) for _ in range(20000)] - assert np.abs(np.mean(samples)) < 0.015 - assert max_val - 0.015 < np.max(samples) <= max_val - assert min_val + 0.015 > np.min(samples) >= min_val + rand = K.eval(K.random_uniform((200, 100), min_val, max_val)) + assert rand.shape == (200, 100) + assert np.abs(np.mean(rand)) < 0.015 + assert max_val - 0.015 < np.max(rand) <= max_val + assert min_val + 0.015 > np.min(rand) >= min_val + + r = K.random_uniform((10, 10), minval=min_val, maxval=max_val) + samples = np.array([K.eval(r) for _ in range(200)]) + assert np.abs(np.mean(samples)) < 0.015 + assert max_val - 0.015 < np.max(samples) <= max_val + assert min_val + 0.015 > np.min(samples) >= min_val def test_random_binomial(self): p = 0.5 - for k in BACKENDS: - rand = k.eval(k.random_binomial((200, 100), p)) - assert rand.shape == (200, 100) - assert np.abs(np.mean(rand) - p) < 0.015 - assert np.max(rand) == 1 - assert np.min(rand) == 0 - - r = k.random_binomial((1,), p) - samples = [k.eval(r) for _ in range(20000)] - assert np.abs(np.mean(samples) - p) < 0.015 - assert np.max(samples) == 1 - assert np.min(samples) == 0 + rand = K.eval(K.random_binomial((200, 100), p)) + assert rand.shape == (200, 100) + assert np.abs(np.mean(rand) - p) < 0.015 + assert np.max(rand) == 1 + assert np.min(rand) == 0 + + r = K.random_binomial((10, 10), p) + samples = np.array([K.eval(r) for _ in range(200)]) + assert np.abs(np.mean(samples) - p) < 0.015 + assert np.max(samples) == 1 + assert np.min(samples) == 0 @pytest.mark.skipif(K.backend() == 'mxnet', reason="MXNet backend does not support truncated normal yet.") @@ -1248,15 +1338,14 @@ def test_truncated_normal(self): std = 1. min_val = -2. max_val = 2. 
- for k in BACKENDS: - rand = k.eval(k.truncated_normal((300, 200), mean=mean, stddev=std, seed=1337)) - assert rand.shape == (300, 200) - assert np.abs(np.mean(rand) - mean) < 0.015 - assert np.max(rand) <= max_val - assert np.min(rand) >= min_val + rand = K.eval(K.truncated_normal((300, 200), mean=mean, stddev=std, seed=1337)) + assert rand.shape == (300, 200) + assert np.abs(np.mean(rand) - mean) < 0.015 + assert np.max(rand) <= max_val + assert np.min(rand) >= min_val - # assumption in initializers.VarianceScaling - assert np.abs(np.std(rand) - std * 0.87962) < 0.015 + # assumption in initializers.VarianceScaling + assert np.abs(np.std(rand) - std * 0.87962) < 0.015 def test_conv_invalid_use(self): dummy_x_1d = K.variable(np.ones((4, 8, 2))) @@ -1325,17 +1414,33 @@ def test_resize_images(self): elif data_format == 'channels_last': x_shape = (2,) + shape + (3,) check_single_tensor_operation('resize_images', x_shape, - BACKENDS, cntk_dynamicity=True, + WITH_NP, cntk_dynamicity=True, height_factor=2, width_factor=2, data_format=data_format) # Test invalid use cases xval = np.random.random(x_shape) - for k in BACKENDS: - with pytest.raises(ValueError): - k.resize_images(k.variable(xval), 2, 2, - data_format='channels_middle') + with pytest.raises(ValueError): + K.resize_images(K.variable(xval), 2, 2, + data_format='channels_middle') + + @staticmethod + def _helper_bilinear(data_format, height_factor, width_factor): + x_shape = (2, 3, 4, 5) + check_single_tensor_operation('resize_images', x_shape, + [KTF, KTH], + height_factor=height_factor, + width_factor=width_factor, + data_format=data_format, + interpolation='bilinear') + + @pytest.mark.skipif(K.backend() == 'cntk', reason='Not supported.') + @pytest.mark.parametrize('data_format', ['channels_first', 'channels_last']) + def test_resize_images_bilinear(self, data_format): + self._helper_bilinear(data_format, 2, 2) + with pytest.raises(NotImplementedError): + self._helper_bilinear(data_format, 4, 4) def test_resize_volumes(self): for data_format in ['channels_first', 'channels_last']: @@ -1345,7 +1450,7 @@ def test_resize_volumes(self): elif data_format == 'channels_last': x_shape = (2,) + shape + (3,) check_single_tensor_operation('resize_volumes', x_shape, - BACKENDS, cntk_dynamicity=True, + WITH_NP, cntk_dynamicity=True, depth_factor=2, height_factor=2, width_factor=2, @@ -1353,10 +1458,9 @@ def test_resize_volumes(self): # Test invalid use cases xval = np.random.random(x_shape) - for k in BACKENDS: - with pytest.raises(ValueError): - k.resize_volumes(k.variable(xval), 2, 2, 2, - data_format='channels_middle') + with pytest.raises(ValueError): + K.resize_volumes(K.variable(xval), 2, 2, 2, + data_format='channels_middle') def test_temporal_padding(self): # MXNet does not support padding on 3D tensors yet. @@ -1376,15 +1480,19 @@ def test_spatial_2d_padding(self): else: # MXNet backend does not support channels_last padding yet. x_shape = (1,) + shape + (3,) - check_single_tensor_operation('spatial_2d_padding', x_shape, BACKENDS_WITHOUT_MXNET, - padding=padding, data_format=data_format) + check_single_tensor_operation('spatial_2d_padding', x_shape, BACKENDS_WITHOUT_MXNET, + padding=padding, data_format=data_format) + # Check handling of dynamic shapes. 
+ for k in [KTF, KTH]: + x = k.placeholder(shape=(1, None, None, 1)) + y = k.spatial_2d_padding(x, padding=padding, data_format='channels_last') + assert k.int_shape(y) == (1, None, None, 1) # Test invalid use cases xval = np.random.random(x_shape) - for k in BACKENDS: - with pytest.raises(ValueError): - k.spatial_2d_padding(k.variable(xval), padding=padding, - data_format='channels_middle') + with pytest.raises(ValueError): + K.spatial_2d_padding(K.variable(xval), padding=padding, + data_format='channels_middle') def test_spatial_3d_padding(self): padding = ((1, 2), (2, 1), (1, 2)) @@ -1397,15 +1505,19 @@ def test_spatial_3d_padding(self): else: # MXNet backend does not support channels_last padding yet. x_shape = (1,) + shape + (3,) - check_single_tensor_operation('spatial_3d_padding', x_shape, BACKENDS_WITHOUT_MXNET, - padding=padding, data_format=data_format) + check_single_tensor_operation('spatial_3d_padding', x_shape, BACKENDS_WITHOUT_MXNET, + padding=padding, data_format=data_format) + # Check handling of dynamic shapes. + for k in [KTF, KTH]: + x = k.placeholder(shape=(1, None, None, None, 1)) + y = k.spatial_3d_padding(x, padding=padding, data_format='channels_last') + assert k.int_shape(y) == (1, None, None, None, 1) # Test invalid use cases xval = np.random.random(x_shape) - for k in BACKENDS: - with pytest.raises(ValueError): - k.spatial_3d_padding(k.variable(xval), padding=padding, - data_format='channels_middle') + with pytest.raises(ValueError): + K.spatial_3d_padding(K.variable(xval), padding=padding, + data_format='channels_middle') def test_bias_add(self): for data_format in ['channels_first', 'channels_last']: @@ -1416,7 +1528,7 @@ def test_bias_add(self): x_shape = (1,) + shape + (4,) bias_shape = (4,) check_two_tensor_operation('bias_add', x_shape, bias_shape, - BACKENDS, cntk_dynamicity=True, + WITH_NP, cntk_dynamicity=True, data_format=data_format) if data_format == 'channels_first': @@ -1424,15 +1536,14 @@ def test_bias_add(self): else: x_shape = (20, 10, 6) check_two_tensor_operation('bias_add', x_shape, (10, 6), - BACKENDS, cntk_dynamicity=True, + WITH_NP, cntk_dynamicity=True, data_format=data_format) # Test invalid use cases - for k in BACKENDS: - x = k.variable(np.random.random(x_shape)) - b = k.variable(np.random.random(bias_shape)) - with pytest.raises(ValueError): - k.bias_add(x, b, data_format='channels_middle') + x = K.variable(np.random.random(x_shape)) + b = K.variable(np.random.random(bias_shape)) + with pytest.raises(ValueError): + K.bias_add(x, b, data_format='channels_middle') @pytest.mark.skipif(K.backend() == 'mxnet', reason="MXNet backend use MXNet native batchnorm. 
To be fixed.") @@ -1522,10 +1633,9 @@ def test_ctc(self): res = K.eval(K.ctc_batch_cost(k_labels, k_inputs, k_input_lens, k_label_lens)) assert_allclose(res[0, :] if K.backend() == 'theano' else res[:, 0], ref, atol=1e-05) - '''only tensorflow tested, need special handle''' - + @pytest.mark.skipif(K.backend() != 'tensorflow', + reason='Test adapted from tensorflow.') def test_ctc_decode_greedy(self): - # Test adapted from tensorflow """Test two batch entries - best path decoder.""" max_time_steps = 6 @@ -1558,9 +1668,9 @@ def test_ctc_decode_greedy(self): for t in range(max_time_steps)] # change tensorflow order to keras backend order - inputs = KTF.variable(np.asarray(inputs).transpose((1, 0, 2))) + inputs = K.variable(np.asarray(inputs).transpose((1, 0, 2))) # batch_size length vector of sequence_lengths - input_length = KTF.variable(np.array([seq_len_0, seq_len_1], dtype=np.int32)) + input_length = K.variable(np.array([seq_len_0, seq_len_1], dtype=np.int32)) # batch_size length vector of negative log probabilities log_prob_truth = np.array([ @@ -1571,20 +1681,21 @@ def test_ctc_decode_greedy(self): # keras output, unlike tensorflow, is a dense (not sparse) tensor decode_truth = np.array([[0, 1, -1], [1, 1, 0]]) - decode_pred_tf, log_prob_pred_tf = KTF.ctc_decode(inputs, - input_length, - greedy=True) + decode_pred_tf, log_prob_pred_tf = K.ctc_decode(inputs, + input_length, + greedy=True) assert len(decode_pred_tf) == 1 - decode_pred = KTF.eval(decode_pred_tf[0]) - log_prob_pred = KTF.eval(log_prob_pred_tf) + decode_pred = K.eval(decode_pred_tf[0]) + log_prob_pred = K.eval(log_prob_pred_tf) assert np.alltrue(decode_truth == decode_pred) assert np.allclose(log_prob_truth, log_prob_pred) - '''tensorflow only, need special handle''' - + @pytest.mark.skipif(K.backend() != 'tensorflow', + reason='Beam search is only implemented with ' + 'the TensorFlow backend.') def test_ctc_decode_beam_search(self): """Test one batch, two beams - hibernating beam search.""" @@ -1606,10 +1717,10 @@ def test_ctc_decode_beam_search(self): for t in range(seq_len_0)] + # Pad to max_time_steps = 8 2 * [np.zeros((1, depth), dtype=np.float32)]) - inputs = KTF.variable(np.asarray(inputs).transpose((1, 0, 2))) + inputs = K.variable(np.asarray(inputs).transpose((1, 0, 2))) # batch_size length vector of sequence_lengths - input_length = KTF.variable(np.array([seq_len_0], dtype=np.int32)) + input_length = K.variable(np.array([seq_len_0], dtype=np.int32)) # batch_size length vector of negative log probabilities log_prob_truth = np.array([ 0.584855, # output beam 0 @@ -1621,18 +1732,18 @@ def test_ctc_decode_beam_search(self): beam_width = 2 top_paths = 2 - decode_pred_tf, log_prob_pred_tf = KTF.ctc_decode(inputs, - input_length, - greedy=False, - beam_width=beam_width, - top_paths=top_paths) + decode_pred_tf, log_prob_pred_tf = K.ctc_decode(inputs, + input_length, + greedy=False, + beam_width=beam_width, + top_paths=top_paths) assert len(decode_pred_tf) == top_paths - log_prob_pred = KTF.eval(log_prob_pred_tf) + log_prob_pred = K.eval(log_prob_pred_tf) for i in range(top_paths): - assert np.alltrue(decode_truth[i] == KTF.eval(decode_pred_tf[i])) + assert np.alltrue(decode_truth[i] == K.eval(decode_pred_tf[i])) assert np.allclose(log_prob_truth, log_prob_pred) @@ -1642,10 +1753,11 @@ def test_one_hot(self): batch_size = 30 indices = np.random.randint(0, num_classes, size=(batch_size, input_length)) oh = np.eye(num_classes)[indices] - for k in BACKENDS: - koh = k.eval(k.one_hot(k.variable(indices, dtype='int32'), 
num_classes)) - assert np.all(koh == oh) + koh = K.eval(K.one_hot(K.variable(indices, dtype='int32'), num_classes)) + assert np.all(koh == oh) + @pytest.mark.skipif(K.backend() == 'cntk', + reason='Sparse tensors are not supported in cntk.') def test_sparse_dot(self): x_d = np.array([0, 7, 2, 3], dtype=np.float32) x_r = np.array([0, 2, 2, 3], dtype=np.int64) @@ -1744,12 +1856,13 @@ def test_foldr(self): assert p1 < p2 assert 9e-38 < p2 <= 1e-37 + @pytest.mark.skipif(K.backend() == 'cntk' or K.backend() == 'mxnet', + reason='cntk and mxnet has issues with negative number.') def test_arange(self): for test_value in (-20, 0, 1, 10): a_list = [] dtype_list = [] - # cntk has issue with negative number - for k in [KTH, KTF]: + for k in WITH_NP: t = k.arange(test_value) a = k.eval(t) assert np.array_equal(a, np.arange(test_value)) @@ -1761,7 +1874,7 @@ def test_arange(self): for start, stop, step in ((0, 5, 1), (-5, 5, 2), (0, 1, 2)): a_list = [] - for k in [KTH, KTF]: + for k in WITH_NP: a = k.eval(k.arange(start, stop, step)) assert np.array_equal(a, np.arange(start, stop, step)) a_list.append(a) @@ -1769,11 +1882,11 @@ def test_arange(self): assert np.array_equal(a_list[i], a_list[i + 1]) for dtype in ('int32', 'int64', 'float32', 'float64'): - for k in [KTH, KTF]: + for k in WITH_NP: t = k.arange(10, dtype=dtype) assert k.dtype(t) == dtype - for k in [KTH, KTF]: + for k in WITH_NP: start = k.constant(1, dtype='int32') t = k.arange(start) assert len(k.eval(t)) == 1 @@ -1782,19 +1895,19 @@ def test_arange(self): t = k.arange(start) assert len(k.eval(t)) == 0 - def test_in_train_phase(self): - for training in [True, False]: - check_two_tensor_operation('in_train_phase', (3, 3), (2, 2), [KTH, KTF], - training=training) - check_two_tensor_operation('in_train_phase', (2, 3), (2, 3), BACKENDS, - training=training) - - def test_in_test_phase(self): - for training in [True, False]: - check_two_tensor_operation('in_test_phase', (3, 3), (2, 2), [KTH, KTF], - training=training) - check_two_tensor_operation('in_test_phase', (2, 3), (2, 3), BACKENDS, - training=training) + @pytest.mark.parametrize('training', [True, False]) + def test_in_train_phase(self, training): + check_two_tensor_operation('in_train_phase', (3, 3), (2, 2), WITH_NP, + training=training) + check_two_tensor_operation('in_train_phase', (2, 3), (2, 3), WITH_NP, + training=training) + + @pytest.mark.parametrize('training', [True, False]) + def test_in_test_phase(self, training): + check_two_tensor_operation('in_test_phase', (3, 3), (2, 2), WITH_NP, + training=training) + check_two_tensor_operation('in_test_phase', (2, 3), (2, 3), WITH_NP, + training=training) def test_setfloatx_incorrect_values(self): # Keep track of the old value diff --git a/tests/keras/backend/reference_operations.py b/tests/keras/backend/reference_operations.py index 75cfe7956f0..e6606269bca 100644 --- a/tests/keras/backend/reference_operations.py +++ b/tests/keras/backend/reference_operations.py @@ -5,31 +5,41 @@ import numpy as np import scipy.signal as signal +import scipy as sp +from keras.backend import floatx def normalize_conv(func): - def wrapper(*args): + def wrapper(*args, **kwargs): x = args[0] w = args[1] if x.ndim == 3: w = np.flipud(w) w = np.transpose(w, (1, 2, 0)) - if args[3] == 'channels_last': + if kwargs['data_format'] == 'channels_last': x = np.transpose(x, (0, 2, 1)) elif x.ndim == 4: w = np.fliplr(np.flipud(w)) w = np.transpose(w, (2, 3, 0, 1)) - if args[3] == 'channels_last': + if kwargs['data_format'] == 'channels_last': x = np.transpose(x, (0, 
3, 1, 2)) else: w = np.flip(np.fliplr(np.flipud(w)), axis=2) w = np.transpose(w, (3, 4, 0, 1, 2)) - if args[3] == 'channels_last': + if kwargs['data_format'] == 'channels_last': x = np.transpose(x, (0, 4, 1, 2, 3)) - y = func(x, w, args[2], args[3]) + dilation_rate = kwargs.pop('dilation_rate', 1) + if isinstance(dilation_rate, int): + dilation_rate = (dilation_rate,) * (x.ndim - 2) + for (i, d) in enumerate(dilation_rate): + if d > 1: + for j in range(w.shape[2 + i] - 1): + w = np.insert(w, 2 * j + 1, 0, axis=2 + i) - if args[3] == 'channels_last': + y = func(x, w, **kwargs) + + if kwargs['data_format'] == 'channels_last': if y.ndim == 3: y = np.transpose(y, (0, 2, 1)) elif y.ndim == 4: @@ -73,8 +83,36 @@ def depthwise_conv(x, w, padding, data_format): def separable_conv(x, w1, w2, padding, data_format): - x2 = depthwise_conv(x, w1, padding, data_format) - return conv(x2, w2, padding, data_format) + x2 = depthwise_conv(x, w1, padding=padding, data_format=data_format) + return conv(x2, w2, padding=padding, data_format=data_format) + + +def conv_transpose(x, w, output_shape, padding, data_format, dilation_rate=1): + if x.ndim == 4: + w = np.fliplr(np.flipud(w)) + w = np.transpose(w, (0, 1, 3, 2)) + else: + w = np.flip(np.fliplr(np.flipud(w)), axis=2) + w = np.transpose(w, (0, 1, 2, 4, 3)) + + if isinstance(dilation_rate, int): + dilation_rate = (dilation_rate,) * (x.ndim - 2) + for (i, d) in enumerate(dilation_rate): + if d > 1: + for j in range(w.shape[i] - 1): + w = np.insert(w, 2 * j + 1, 0, axis=i) + + return conv(x, w, padding=padding, data_format=data_format) + + +conv1d = conv +conv2d = conv +conv3d = conv +depthwise_conv2d = depthwise_conv +separable_conv1d = separable_conv +separable_conv2d = separable_conv +conv2d_transpose = conv_transpose +conv3d_transpose = conv_transpose def pool(x, pool_size, strides, padding, data_format, pool_mode): @@ -107,7 +145,11 @@ def pool(x, pool_size, strides, padding, data_format, pool_mode): for (k, k1) in zip(range(pool_size[0]), range(-pool_size[0], 0)): for (l, l1) in zip(range(pool_size[1]), range(-pool_size[1], 0)): for (m, m1) in zip(range(pool_size[2]), range(-pool_size[2], 0)): - y.append(x[:, :, k:k1:strides[0], l:l1:strides[1], m:m1:strides[2]]) + y.append(x[:, + :, + k:k1:strides[0], + l:l1:strides[1], + m:m1:strides[2]]) y = np.stack(y, axis=-1) if pool_mode == 'avg': y = np.mean(np.ma.masked_invalid(y), axis=-1).data @@ -125,6 +167,22 @@ def pool(x, pool_size, strides, padding, data_format, pool_mode): return y +pool2d = pool +pool3d = pool + + +def bias_add(x, y, data_format): + if data_format == 'channels_first': + if y.ndim > 1: + y = np.reshape(y, y.shape[::-1]) + for _ in range(x.ndim - y.ndim - 1): + y = np.expand_dims(y, -1) + else: + for _ in range(x.ndim - y.ndim - 1): + y = np.expand_dims(y, 0) + return x + y + + def rnn(x, w, init, go_backwards=False, mask=None, unroll=False, input_length=None): w_i, w_h, w_o = w h = [] @@ -162,3 +220,372 @@ def rnn(x, w, init, go_backwards=False, mask=None, unroll=False, input_length=No h.append(h_t + h_t1) return o[-1], np.stack(o, axis=1), np.stack(h, axis=1) + + +_LEARNING_PHASE = True + + +def learning_phase(): + return _LEARNING_PHASE + + +def set_learning_phase(value): + global _LEARNING_PHASE + _LEARNING_PHASE = value + + +def in_train_phase(x, alt, training=None): + if training is None: + training = learning_phase() + + if training is 1 or training is True: + if callable(x): + return x() + else: + return x + else: + if callable(alt): + return alt() + else: + return alt + + +def 
in_test_phase(x, alt, training=None): + return in_train_phase(alt, x, training=training) + + +def relu(x, alpha=0., max_value=None, threshold=0.): + y = x * (x >= threshold) + if max_value is not None: + y = np.clip(y, 0.0, max_value) + y += alpha * (x - threshold) * (x < threshold) + return y + + +def switch(condition, then_expression, else_expression): + cond_float = condition.astype(floatx()) + while cond_float.ndim < then_expression.ndim: + cond_float = cond_float[..., None] + return cond_float * then_expression + (1 - cond_float) * else_expression + + +def softplus(x): + return np.log(1. + np.exp(x)) + + +def elu(x, alpha=1.): + return x * (x > 0) + alpha * (np.exp(x) - 1.) * (x < 0) + + +def sigmoid(x): + return 1. / (1. + np.exp(-x)) + + +def hard_sigmoid(x): + y = 0.2 * x + 0.5 + y = np.minimum(y, 1.) + y = np.maximum(y, 0.) + return y + + +def tanh(x): + return np.tanh(x) + + +def softmax(x, axis=-1): + y = np.exp(x - np.max(x, axis, keepdims=True)) + return y / np.sum(y, axis, keepdims=True) + + +def l2_normalize(x, axis=-1): + y = np.max(np.sum(x ** 2, axis, keepdims=True), axis, keepdims=True) + return x / np.sqrt(y) + + +def binary_crossentropy(target, output, from_logits=False): + if not from_logits: + output = np.clip(output, 1e-7, 1 - 1e-7) + output = np.log(output / (1 - output)) + return (target * -np.log(sigmoid(output)) + + (1 - target) * -np.log(1 - sigmoid(output))) + + +def categorical_crossentropy(target, output, from_logits=False): + if from_logits: + output = softmax(output) + else: + output /= output.sum(axis=-1, keepdims=True) + output = np.clip(output, 1e-7, 1 - 1e-7) + return np.sum(target * -np.log(output), axis=-1, keepdims=False) + + +def max(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return np.max(x, axis=axis, keepdims=keepdims) + + +def min(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return np.min(x, axis=axis, keepdims=keepdims) + + +def mean(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return np.mean(x, axis=axis, keepdims=keepdims) + + +def var(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return np.var(x, axis=axis, keepdims=keepdims) + + +def std(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return np.std(x, axis=axis, keepdims=keepdims) + + +def logsumexp(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return sp.misc.logsumexp(x, axis=axis, keepdims=keepdims) + + +def sum(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return np.sum(x, axis=axis, keepdims=keepdims) + + +def prod(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return np.prod(x, axis=axis, keepdims=keepdims) + + +def cumsum(x, axis=0): + return np.cumsum(x, axis=axis) + + +def cumprod(x, axis=0): + return np.cumprod(x, axis=axis) + + +def any(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return np.any(x, axis=axis, keepdims=keepdims) + + +def all(x, axis=None, keepdims=False): + if isinstance(axis, list): + axis = tuple(axis) + return np.all(x, axis=axis, keepdims=keepdims) + + +def argmax(x, axis=-1): + return np.argmax(x, axis=axis) + + +def argmin(x, axis=-1): + return np.argmin(x, axis=axis) + + +def sqrt(x): + y = np.sqrt(x) + y[np.isnan(y)] = 0. 
+ return y + + +def pow(x, a=1.): + return np.power(x, a) + + +def clip(x, min_value, max_value): + return np.clip(x, min_value, max_value) + + +def concatenate(tensors, axis=-1): + return np.concatenate(tensors, axis) + + +def permute_dimensions(x, pattern): + return np.transpose(x, pattern) + + +def reshape(x, shape): + return np.reshape(x, shape) + + +def repeat_elements(x, rep, axis): + return np.repeat(x, rep, axis=axis) + + +def repeat(x, n): + y = np.expand_dims(x, 1) + y = np.repeat(y, n, axis=1) + return y + + +def tile(x, n): + return np.tile(x, n) + + +def arange(start, stop=None, step=1, dtype='int32'): + return np.arange(start, stop, step, dtype) + + +def flatten(x): + return np.reshape(x, (-1,)) + + +def batch_flatten(x): + return np.reshape(x, (x.shape[0], -1)) + + +def eval(x): + return x + + +def dtype(x): + return x.dtype.name + + +def constant(value, dtype=None, shape=None, name=None): + if dtype is None: + dtype = floatx() + if shape is None: + shape = () + np_value = value * np.ones(shape) + np_value.astype(dtype) + return np_value + + +def print_tensor(x, message=''): + print(x, message) + return x + + +def dot(x, y): + return np.dot(x, y) + + +def transpose(x): + return np.transpose(x) + + +def reverse(x, axes): + if isinstance(axes, int): + axes = [axes] + for a in axes: + x = np.flip(x, a) + return x + + +def variable(value, dtype=None, name=None, constraint=None): + if constraint is not None: + raise TypeError("Constraint must be None when " + "using the NumPy backend.") + return np.array(value, dtype) + + +def equal(x, y): + return x == y + + +def not_equal(x, y): + return x != y + + +def greater(x, y): + return x > y + + +def greater_equal(x, y): + return x >= y + + +def less(x, y): + return x < y + + +def less_equal(x, y): + return x <= y + + +def maximum(x, y): + return np.maximum(x, y) + + +def minimum(x, y): + return np.minimum(x, y) + + +def ndim(x): + return x.ndim + + +def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None): + return (high - low) * np.random.random(shape).astype(dtype) + low + + +def random_normal_variable(shape, mean, scale, dtype=None, name=None, seed=None): + return scale * np.random.randn(*shape).astype(dtype) + mean + + +def zeros(shape, dtype=floatx(), name=None): + return np.zeros(shape, dtype=dtype) + + +def zeros_like(x, dtype=floatx(), name=None): + return np.zeros_like(x, dtype=dtype) + + +def ones(shape, dtype=floatx(), name=None): + return np.ones(shape, dtype=dtype) + + +def ones_like(x, dtype=floatx(), name=None): + return np.ones_like(x, dtype=dtype) + + +def eye(size, dtype=None, name=None): + return np.eye(size, dtype=dtype) + + +def resize_images(x, height_factor, width_factor, data_format): + if data_format == 'channels_first': + x = repeat_elements(x, height_factor, axis=2) + x = repeat_elements(x, width_factor, axis=3) + elif data_format == 'channels_last': + x = repeat_elements(x, height_factor, axis=1) + x = repeat_elements(x, width_factor, axis=2) + return x + + +def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): + if data_format == 'channels_first': + x = repeat_elements(x, depth_factor, axis=2) + x = repeat_elements(x, height_factor, axis=3) + x = repeat_elements(x, width_factor, axis=4) + elif data_format == 'channels_last': + x = repeat_elements(x, depth_factor, axis=1) + x = repeat_elements(x, height_factor, axis=2) + x = repeat_elements(x, width_factor, axis=3) + return x + + +square = np.square +abs = np.abs +exp = np.exp +log = np.log +round = np.round 
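+# The aliases above and below re-export NumPy ufuncs directly, so the
+# reference backend exposes the same names as the Keras backend API.
+# Illustrative sketch only (assumes plain NumPy; not part of the public API):
+#     >>> round(np.array([0.4, 1.6]))    # same as np.round
+#     array([0., 2.])
+#     >>> sign(np.array([-3., 0., 5.]))  # same as np.sign
+#     array([-1.,  0.,  1.])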
+sign = np.sign +expand_dims = np.expand_dims +squeeze = np.squeeze +cos = np.cos +sin = np.sin diff --git a/tests/keras/constraints_test.py b/tests/keras/constraints_test.py index de4a1974754..c08175ac9c9 100644 --- a/tests/keras/constraints_test.py +++ b/tests/keras/constraints_test.py @@ -4,7 +4,6 @@ from keras import backend as K from keras import constraints -from keras.utils.test_utils import keras_test def get_test_values(): @@ -30,7 +29,6 @@ def test_serialization(): assert fn.__class__ == ref_fn.__class__ -@keras_test def test_max_norm(): array = get_example_array() for m in get_test_values(): @@ -50,14 +48,12 @@ def test_max_norm(): assert_allclose(x_normed_actual, x_normed_target, rtol=1e-05) -@keras_test def test_non_neg(): non_neg_instance = constraints.non_neg() normed = non_neg_instance(K.variable(get_example_array())) assert(np.all(np.min(K.eval(normed), axis=1) == 0.)) -@keras_test def test_unit_norm(): unit_norm_instance = constraints.unit_norm() normalized = unit_norm_instance(K.variable(get_example_array())) @@ -68,7 +64,6 @@ def test_unit_norm(): assert(np.abs(largest_difference) < 10e-5) -@keras_test def test_min_max_norm(): array = get_example_array() for m in get_test_values(): diff --git a/tests/keras/engine/test_topology.py b/tests/keras/engine/test_topology.py index a15bea5b686..a0fff1527a3 100644 --- a/tests/keras/engine/test_topology.py +++ b/tests/keras/engine/test_topology.py @@ -8,26 +8,15 @@ from keras.models import Model, Sequential from keras import backend as K from keras.models import model_from_json, model_from_yaml -from keras.utils.test_utils import keras_test from keras.initializers import Constant skipif_no_tf_gpu = pytest.mark.skipif( - (K.backend() != 'tensorflow') or (not K.tensorflow_backend._get_available_gpus()), + (K.backend() != 'tensorflow' or + not K.tensorflow_backend._get_available_gpus()), reason='Requires TensorFlow backend and a GPU') -skipif_no_tf_gpu = pytest.mark.skipif( - (K.backend() != 'tensorflow') or (not K.tensorflow_backend._get_available_gpus()), - reason='Requires TensorFlow backend and a GPU') - - -skipif_no_tf_gpu = pytest.mark.skipif( - (K.backend() != 'tensorflow') or (not K.tensorflow_backend._get_available_gpus()), - reason='Requires TensorFlow backend and a GPU') - - -@keras_test def test_get_updates_for(): a = Input(shape=(2,)) dense_layer = Dense(1) @@ -38,7 +27,6 @@ def test_get_updates_for(): assert dense_layer.get_updates_for(None) == [1] -@keras_test def test_get_losses_for(): a = Input(shape=(2,)) dense_layer = Dense(1) @@ -49,7 +37,6 @@ def test_get_losses_for(): assert dense_layer.get_losses_for(None) == [1] -@keras_test def test_trainable_weights(): a = Input(shape=(2,)) b = Dense(1)(a) @@ -164,7 +151,6 @@ def test_learning_phase(): assert fn_outputs_no_dp[1].sum() != fn_outputs_dp[1].sum() -@keras_test def test_layer_call_arguments(): # Test the ability to pass and serialize arguments to `call`. 
inp = layers.Input(shape=(2,)) @@ -184,7 +170,6 @@ def test_layer_call_arguments(): assert not model.uses_learning_phase -@keras_test def test_node_construction(): #################################################### # test basics @@ -267,7 +252,6 @@ def test_node_construction(): assert dense.get_output_mask_at(1) is None -@keras_test def test_multi_input_layer(): #################################################### # test multi-input layer @@ -296,13 +280,15 @@ def test_multi_input_layer(): model = Model(inputs=[a, b], outputs=[c, d], name='model') assert len(model.layers) == 6 - assert model.compute_output_shape([(None, 32), (None, 32)]) == [(None, 64), (None, 5)] + expected_shapes = [(None, 64), (None, 5)] + assert model.compute_output_shape([(None, 32), (None, 32)]) == expected_shapes assert model.compute_mask([a, b], [None, None]) == [None, None] - assert model.compute_output_shape([(None, 32), (None, 32)]) == [(None, 64), (None, 5)] + assert model.compute_output_shape([(None, 32), (None, 32)]) == expected_shapes # we don't check names of first 2 layers (inputs) because # ordering of same-level layers is not fixed - assert [l.name for l in model.layers][2:] == ['dense_1', 'merge', 'dense_2', 'dense_3'] + expected_names = ['dense_1', 'merge', 'dense_2', 'dense_3'] + assert [l.name for l in model.layers][2:] == expected_names assert [l.name for l in model._input_layers] == ['input_a', 'input_b'] assert [l.name for l in model._output_layers] == ['dense_2', 'dense_3'] @@ -321,7 +307,7 @@ def test_multi_input_layer(): recreated_model = model_from_json(json_config) recreated_model.compile('rmsprop', 'mse') - assert [l.name for l in recreated_model.layers][2:] == ['dense_1', 'merge', 'dense_2', 'dense_3'] + assert [l.name for l in recreated_model.layers][2:] == expected_names assert [l.name for l in recreated_model._input_layers] == ['input_a', 'input_b'] assert [l.name for l in recreated_model._output_layers] == ['dense_2', 'dense_3'] @@ -332,7 +318,6 @@ def test_multi_input_layer(): assert [x.shape for x in fn_outputs] == [(10, 64), (10, 5)] -@keras_test def test_recursion(): #################################################### # test recursion @@ -368,9 +353,10 @@ def test_recursion(): # we don't check names of first 2 layers (inputs) because # ordering of same-level layers is not fixed + expected_shapes = [(10, 7), (10, 64)] assert [layer.name for layer in final_model.layers][2:] == ['model', 'dense_4'] assert model.compute_mask([e, f], [None, None]) == [None, None] - assert final_model.compute_output_shape([(10, 32), (10, 32)]) == [(10, 7), (10, 64)] + assert final_model.compute_output_shape([(10, 32), (10, 32)]) == expected_shapes # run recursive model fn = K.function(final_model.inputs, final_model.outputs) @@ -513,9 +499,9 @@ def test_recursion(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support Bidirectional') -@keras_test def test_load_layers(): - from keras.layers import ConvLSTM2D, TimeDistributed, Bidirectional, Conv2D, Input + from keras.layers import ConvLSTM2D, TimeDistributed + from keras.layers import Bidirectional, Conv2D, Input from keras.models import Model if K.backend() == 'tensorflow' or K.backend() == 'cntk': @@ -523,13 +509,14 @@ def test_load_layers(): else: inputs = Input(shape=(10, 1, 20, 20)) td_conv = TimeDistributed(Conv2D(15, (5, 5)))(inputs) - bi_convlstm2d = Bidirectional(ConvLSTM2D(10, (3, 3)), merge_mode='concat')(td_conv) - model = Model(inputs=inputs, outputs=bi_convlstm2d) + bi_conv = Bidirectional(ConvLSTM2D(10, (3, 
3)), merge_mode='concat')(td_conv) + model = Model(inputs=inputs, outputs=bi_conv) weight_value_tuples = [] # TimeDistributed Conv2D layer - # use 'channels_first' data format to check that the function is being called correctly for Conv2D + # use 'channels_first' data format to check that + # the function is being called correctly for Conv2D # old: (filters, stack_size, kernel_rows, kernel_cols) # new: (kernel_rows, kernel_cols, stack_size, filters) weight_tensor_td_conv_old = list() @@ -546,34 +533,35 @@ def test_load_layers(): weight_value_tuples += zip(symbolic_weights, weight_tensor_td_conv_new) # Bidirectional ConvLSTM2D layer - # old ConvLSTM2D took a list of 12 weight tensors, returns a list of 3 concatenated larger tensors. - weight_tensor_bi_convlstm_old = [] + # old ConvLSTM2D took a list of 12 weight tensors, + # returns a list of 3 concatenated larger tensors. + weights_bi_conv_old = [] for j in range(2): # bidirectional for i in range(4): - weight_tensor_bi_convlstm_old.append(np.zeros((3, 3, 15, 10))) # kernel - weight_tensor_bi_convlstm_old.append(np.zeros((3, 3, 10, 10))) # recurrent kernel - weight_tensor_bi_convlstm_old.append(np.zeros((10,))) # bias + weights_bi_conv_old.append(np.zeros((3, 3, 15, 10))) # kernel + weights_bi_conv_old.append(np.zeros((3, 3, 10, 10))) # recurrent kernel + weights_bi_conv_old.append(np.zeros((10,))) # bias bi_convlstm_layer = model.layers[2] - weight_tensor_bi_convlstm_new = saving.preprocess_weights_for_loading( + weights_bi_conv_new = saving.preprocess_weights_for_loading( bi_convlstm_layer, - weight_tensor_bi_convlstm_old, + weights_bi_conv_old, original_keras_version='1') symbolic_weights = bi_convlstm_layer.weights - assert (len(symbolic_weights) == len(weight_tensor_bi_convlstm_new)) - weight_value_tuples += zip(symbolic_weights, weight_tensor_bi_convlstm_new) + assert (len(symbolic_weights) == len(weights_bi_conv_new)) + weight_value_tuples += zip(symbolic_weights, weights_bi_conv_new) K.batch_set_value(weight_value_tuples) assert np.all(K.eval(model.layers[1].weights[0]) == weight_tensor_td_conv_new[0]) assert np.all(K.eval(model.layers[1].weights[1]) == weight_tensor_td_conv_new[1]) - assert np.all(K.eval(model.layers[2].weights[0]) == weight_tensor_bi_convlstm_new[0]) - assert np.all(K.eval(model.layers[2].weights[1]) == weight_tensor_bi_convlstm_new[1]) - assert np.all(K.eval(model.layers[2].weights[2]) == weight_tensor_bi_convlstm_new[2]) - assert np.all(K.eval(model.layers[2].weights[3]) == weight_tensor_bi_convlstm_new[3]) - assert np.all(K.eval(model.layers[2].weights[4]) == weight_tensor_bi_convlstm_new[4]) - assert np.all(K.eval(model.layers[2].weights[5]) == weight_tensor_bi_convlstm_new[5]) + assert np.all(K.eval(model.layers[2].weights[0]) == weights_bi_conv_new[0]) + assert np.all(K.eval(model.layers[2].weights[1]) == weights_bi_conv_new[1]) + assert np.all(K.eval(model.layers[2].weights[2]) == weights_bi_conv_new[2]) + assert np.all(K.eval(model.layers[2].weights[3]) == weights_bi_conv_new[3]) + assert np.all(K.eval(model.layers[2].weights[4]) == weights_bi_conv_new[4]) + assert np.all(K.eval(model.layers[2].weights[5]) == weights_bi_conv_new[5]) def convert_weights(layer, weights): @@ -590,16 +578,16 @@ def convert_weights(layer, weights): return weights -@pytest.mark.skipif(K.backend() == 'mxnet', - reason='MXNet backend does not support RNN') -@keras_test +# MXNet does not support ConvLSTM2D yet +# @pytest.mark.parametrize("layer", [ +# layers.ConvLSTM2D(5, (3, 3), +# input_shape=[6, 6, 6, 6], +# 
data_format='channels_first'), +# ], ids=['ConvLSTM2D']) @pytest.mark.parametrize("layer", [ layers.GRU(2, input_shape=[3, 5]), layers.LSTM(2, input_shape=[3, 5]), - layers.ConvLSTM2D(5, (3, 3), - input_shape=[6, 6, 6, 6], - data_format='channels_first'), -], ids=['GRU', 'LSTM', 'ConvLSTM2D']) +], ids=['GRU', 'LSTM']) def test_preprocess_weights_for_loading(layer): # A model is needed to initialize weights. _ = Sequential([layer]) @@ -611,7 +599,6 @@ def test_preprocess_weights_for_loading(layer): for (x, y) in zip(weights1, weights2)]) -@keras_test @pytest.mark.parametrize("layer", [ layers.Conv2D(2, (3, 3), input_shape=[5, 5, 3]), layers.Conv2DTranspose(2, (5, 5), @@ -630,18 +617,17 @@ def test_preprocess_weights_for_loading_for_model(layer): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend uses native MXNet Batchnorm. To be fixed.') -@keras_test -@pytest.mark.parametrize('layer_class,layer_args', [ +@pytest.mark.parametrize('layer_class,args', [ (layers.GRU, {'units': 2, 'input_shape': [3, 5]}), (layers.GRU, {'units': 2, 'input_shape': [3, 5], 'reset_after': True}), (layers.LSTM, {'units': 2, 'input_shape': [3, 5]}), ]) -def test_preprocess_weights_for_loading_rnn_should_be_idempotent(layer_class, layer_args): +def test_preprocess_weights_for_loading_rnn_should_be_idempotent(layer_class, args): """ Loading weights from a RNN class to itself should not convert the weights. """ # layer can be instantiated only for supported backends - layer = layer_class(**layer_args) + layer = layer_class(**args) # A model is needed to initialize weights. _ = Sequential([layer]) weights1 = layer.get_weights() @@ -649,19 +635,18 @@ def test_preprocess_weights_for_loading_rnn_should_be_idempotent(layer_class, la assert all([np.allclose(x, y, 1e-5) for (x, y) in zip(weights1, weights2)]) -@keras_test -@pytest.mark.parametrize('layer_class,layer_args', [ +@pytest.mark.parametrize('layer_class,args', [ (layers.CuDNNGRU, {'units': 2, 'input_shape': [3, 5]}), (layers.CuDNNLSTM, {'units': 2, 'input_shape': [3, 5]}), ]) @skipif_no_tf_gpu -def test_preprocess_weights_for_loading_cudnn_rnn_should_be_idempotent(layer_class, layer_args): - test_preprocess_weights_for_loading_rnn_should_be_idempotent(layer_class, layer_args) +def test_preprocess_weights_for_loading_cudnn_rnn_should_be_idempotent(layer_class, + args): + test_preprocess_weights_for_loading_rnn_should_be_idempotent(layer_class, args) @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend uses native MXNet Batchnorm. To be fixed.') -@keras_test def test_recursion_with_bn_and_loss(): model1 = Sequential([ layers.Dense(5, input_dim=5, activity_regularizer='l1'), @@ -690,7 +675,6 @@ def test_recursion_with_bn_and_loss(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not fully support Embedding layer yet.') -@keras_test def test_activity_regularization_with_model_composition(): def reg(x): @@ -713,14 +697,12 @@ def reg(x): assert loss == 4 -@pytest.mark.skipif(K.backend() == 'mxnet', - reason='MXNet backend does not fully support Embedding layer yet.') -@keras_test def test_shared_layer_depth_is_correct(): - # Basic outline here: we have a shared embedding layer, and two inputs that go through - # different depths of computation in the graph before the final output. We need the computed - # depth of the input layers to be the same, because they both pass through the embedding layer - # before anything else happens. That's what we're testing. 
+ # Basic outline here: we have a shared embedding layer, and two inputs that + # go through different depths of computation in the graph before + # the final output. We need the computed depth of the input layers to be + # the same, because they both pass through the embedding layer before anything + # else happens. That's what we're testing. from keras.layers import Embedding, Input, Dense, Concatenate from keras.models import Model input1 = Input(shape=(10,), name='input1') @@ -743,7 +725,6 @@ def test_shared_layer_depth_is_correct(): assert input1_depth == input2_depth -@keras_test def test_layer_sharing_at_heterogeneous_depth(): x_val = np.random.random((10, 5)) @@ -765,7 +746,6 @@ def test_layer_sharing_at_heterogeneous_depth(): np.testing.assert_allclose(output_val, output_val_2, atol=1e-6) -@keras_test def test_layer_sharing_at_heterogeneous_depth_with_concat(): input_shape = (16, 9, 3) input_layer = Input(shape=input_shape) @@ -793,39 +773,39 @@ def test_layer_sharing_at_heterogeneous_depth_with_concat(): np.testing.assert_allclose(output_val, output_val_2, atol=1e-6) -@keras_test def test_multi_output_mask(): """Fixes #7589""" - class ArbitraryMultiOutputLayer(Layer): + class TestMultiOutputLayer(Layer): def __init__(self, **kwargs): - super(ArbitraryMultiOutputLayer, self).__init__(**kwargs) + super(TestMultiOutputLayer, self).__init__(**kwargs) def call(self, inputs, **kwargs): return [K.abs(inputs), K.abs(inputs)] def compute_output_shape(self, input_shape): - out_shape = super(ArbitraryMultiOutputLayer, self).compute_output_shape(input_shape) + out_shape = super(TestMultiOutputLayer, self).compute_output_shape( + input_shape) return [out_shape, out_shape] - class ArbitraryMultiInputLayer(Layer): + class TestMultiInputLayer(Layer): def __init__(self, **kwargs): - super(ArbitraryMultiInputLayer, self).__init__(**kwargs) + super(TestMultiInputLayer, self).__init__(**kwargs) def call(self, inputs, **kwargs): negative, positive = inputs return negative + positive input_layer = Input(shape=(16, 16, 3)) - x, y = ArbitraryMultiOutputLayer()(input_layer) - z = ArbitraryMultiInputLayer()([x, y]) + x, y = TestMultiOutputLayer()(input_layer) + z = TestMultiInputLayer()([x, y]) _ = Model(inputs=input_layer, outputs=z) assert K.int_shape(z)[1:] == (16, 16, 3) -@keras_test def test_constant_initializer_with_numpy(): model = Sequential() - model.add(Dense(2, input_shape=(3,), kernel_initializer=Constant(np.ones((3, 2))))) + model.add(Dense(2, input_shape=(3,), + kernel_initializer=Constant(np.ones((3, 2))))) model.add(Dense(3)) model.compile(loss='mse', optimizer='sgd', metrics=['acc']) diff --git a/tests/keras/engine/test_training.py b/tests/keras/engine/test_training.py index eec40b417c9..24b488d90b8 100644 --- a/tests/keras/engine/test_training.py +++ b/tests/keras/engine/test_training.py @@ -1,3 +1,5 @@ +import threading + import pytest import numpy as np import pandas as pd @@ -15,7 +17,6 @@ from keras.models import Sequential from keras import backend as K from keras.utils import Sequence -from keras.utils.test_utils import keras_test from keras.callbacks import LambdaCallback @@ -23,11 +24,13 @@ class RandomSequence(Sequence): def __init__(self, batch_size, sequence_length=12): self.batch_size = batch_size self.sequence_length = sequence_length + self.logs = [] # It will work for use_multiprocessing=False def __len__(self): return self.sequence_length def __getitem__(self, idx): + self.logs.append(idx) return ([np.random.random((self.batch_size, 3)), np.random.random((self.batch_size, 
3))], [np.random.random((self.batch_size, 4)), @@ -37,7 +40,36 @@ def on_epoch_end(self): pass -@keras_test +class threadsafe_iter: + """Takes an iterator/generator and makes it thread-safe by + serializing call to the `next` method of given iterator/generator. + """ + + def __init__(self, it): + self.it = it + self.lock = threading.Lock() + + def __iter__(self): + return self + + def __next__(self): + return self.next() + + def next(self): + with self.lock: + return next(self.it) + + +def threadsafe_generator(f): + """A decorator that takes a generator function and makes it thread-safe. + """ + + def g(*a, **kw): + return threadsafe_iter(f(*a, **kw)) + + return g + + def test_check_array_length_consistency(): training_utils.check_array_length_consistency(None, None, None) a_np = np.random.random((4, 3, 3)) @@ -59,7 +91,6 @@ def test_check_array_length_consistency(): training_utils.check_array_length_consistency([a_np], None, [b_np]) -@keras_test def testslice_arrays(): input_a = np.random.random((10, 3)) slice_arrays(None) @@ -80,7 +111,6 @@ def testslice_arrays(): slice_arrays(input_a, stop=2) -@keras_test def test_weighted_masked_objective(): a = Input(shape=(3,), name='input_a') @@ -93,7 +123,6 @@ def mask_dummy(y_true=None, y_pred=None, weight=None): weighted_function(a, a, None) -@keras_test def test_model_methods(): a = Input(shape=(3,), name='input_a') b = Input(shape=(3,), name='input_b') @@ -262,6 +291,7 @@ def on_batch_begin(batch, logs): # test starting from non-zero initial epoch for generator too trained_epochs = [] + @threadsafe_generator def gen_data(batch_sz): while True: yield ([np.random.random((batch_sz, 3)), @@ -307,9 +337,11 @@ def mse(y_true, y_pred): # empty batch with pytest.raises(ValueError): + @threadsafe_generator def gen_data(): while True: yield (np.asarray([]), np.asarray([])) + out = model.evaluate_generator(gen_data(), steps=1) # x is not a list of numpy arrays. @@ -391,9 +423,10 @@ def gen_data(): # the rank of weight arrays should be 1. 
with pytest.raises(ValueError): - out = model.train_on_batch([input_a_np, input_b_np], - [output_a_np, output_b_np], - sample_weight=[None, np.random.random((10, 20, 30))]) + out = model.train_on_batch( + [input_a_np, input_b_np], + [output_a_np, output_b_np], + sample_weight=[None, np.random.random((10, 20, 30))]) model.compile(optimizer, loss='mse', sample_weight_mode={'dense_1': None, 'dropout': 'temporal'}) @@ -409,33 +442,53 @@ def gen_data(): sample_weight_mode=None) trained_epochs = [] trained_batches = [] + val_seq = RandomSequence(4) out = model.fit_generator(generator=RandomSequence(3), steps_per_epoch=3, epochs=5, initial_epoch=0, - validation_data=RandomSequence(4), + validation_data=val_seq, validation_steps=3, + max_queue_size=1, callbacks=[tracker_cb]) assert trained_epochs == [0, 1, 2, 3, 4] assert trained_batches == list(range(3)) * 5 + assert len(val_seq.logs) <= 4 * 5 # steps_per_epoch will be equal to len of sequence if it's unspecified trained_epochs = [] trained_batches = [] + val_seq = RandomSequence(4) out = model.fit_generator(generator=RandomSequence(3), epochs=5, initial_epoch=0, - validation_data=RandomSequence(4), + validation_data=val_seq, callbacks=[tracker_cb]) assert trained_epochs == [0, 1, 2, 3, 4] assert trained_batches == list(range(12)) * 5 + assert len(val_seq.logs) == 12 * 5 + + # test for workers = 0 + trained_epochs = [] + trained_batches = [] + val_seq = RandomSequence(4) + out = model.fit_generator(generator=RandomSequence(3), + epochs=5, + validation_data=val_seq, + callbacks=[tracker_cb], + workers=0) + assert trained_epochs == [0, 1, 2, 3, 4] + assert trained_batches == list(range(12)) * 5 + assert len(val_seq.logs) == 12 * 5 # fit_generator will throw an exception # if steps is unspecified for regular generator with pytest.raises(ValueError): + @threadsafe_generator def gen_data(): while True: yield (np.asarray([]), np.asarray([])) + out = model.fit_generator(generator=gen_data(), epochs=5, initial_epoch=0, validation_data=gen_data(), callbacks=[tracker_cb]) @@ -443,6 +496,7 @@ def gen_data(): # Check if generator is only accessed an expected number of times gen_counters = [0, 0] + @threadsafe_generator def gen_data(i): while True: gen_counters[i] += 1 @@ -457,7 +511,9 @@ def gen_data(i): # Need range check here as filling # of the queue depends on sleep in the enqueuers - assert 6 <= gen_counters[0] <= 8 + max_train = 3 * 2 + 2 * 2 + min_train = 2 * 3 + assert min_train <= gen_counters[0] <= max_train # 12 = (epoch * workers * validation steps * max_queue_size) assert 3 <= gen_counters[1] <= 12 @@ -520,7 +576,6 @@ def expected_shape(batch_size, n_batches): @pytest.mark.skipif(sys.version_info < (3,), reason='Cannot catch warnings in python 2') -@keras_test def test_warnings(): a = Input(shape=(3,), name='input_a') b = Input(shape=(3,), name='input_b') @@ -537,6 +592,7 @@ def test_warnings(): model.compile(optimizer, loss, metrics=[], loss_weights=loss_weights, sample_weight_mode=None) + @threadsafe_generator def gen_data(batch_sz): while True: yield ([np.random.random((batch_sz, 3)), @@ -557,14 +613,10 @@ def gen_data(batch_sz): steps_per_epoch=4, use_multiprocessing=True, workers=2) - assert all(['Sequence' not in str(w_.message) for w_ in w]), 'A warning was raised for Sequence.' 
+ assert all(['Sequence' not in str(w_.message) for w_ in w]), ( + 'A warning was raised for Sequence.') -@pytest.mark.skipif(K.backend() == 'cntk', - reason='sparse operations not supported by CNTK') -@pytest.mark.skipif(K.backend() == 'theano', - reason='sparse operations not supported by Theano') -@keras_test def test_sparse_inputs_targets(): test_inputs = [sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)] test_outputs = [sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)] @@ -580,8 +632,8 @@ def test_sparse_inputs_targets(): model.evaluate(test_inputs, test_outputs, batch_size=2) -@pytest.mark.skipif(K.backend() != 'tensorflow', reason='sparse operations supported only by TensorFlow') -@keras_test +@pytest.mark.skipif(K.backend() != 'tensorflow', + reason='sparse operations supported only by TensorFlow') def test_sparse_placeholder_fit(): test_inputs = [sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)] test_outputs = [sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)] @@ -597,7 +649,6 @@ def test_sparse_placeholder_fit(): model.evaluate(test_inputs, test_outputs, batch_size=2) -@keras_test def test_trainable_argument(): x = np.random.random((5, 3)) y = np.random.random((5, 2)) @@ -621,7 +672,6 @@ def test_trainable_argument(): assert_allclose(out, out_2) -@keras_test def test_with_list_as_targets(): model = Sequential() model.add(Dense(1, input_dim=3, trainable=False)) @@ -632,7 +682,6 @@ def test_with_list_as_targets(): model.train_on_batch(x, y) -@keras_test def test_check_not_failing(): a = np.random.random((2, 1, 3)) training_utils.check_loss_and_target_compatibility( @@ -641,7 +690,6 @@ def test_check_not_failing(): [a], [losses.categorical_crossentropy], [(2, None, 3)]) -@keras_test def test_check_last_is_one(): a = np.random.random((2, 3, 1)) with pytest.raises(ValueError) as exc: @@ -651,7 +699,6 @@ def test_check_last_is_one(): assert 'You are passing a target array' in str(exc) -@keras_test def test_check_bad_shape(): a = np.random.random((2, 3, 5)) with pytest.raises(ValueError) as exc: @@ -661,8 +708,8 @@ def test_check_bad_shape(): assert 'targets to have the same shape' in str(exc) -@pytest.mark.skipif(K.backend() != 'tensorflow', reason='Requires TensorFlow backend') -@keras_test +@pytest.mark.skipif(K.backend() != 'tensorflow', + reason='Requires TensorFlow backend') def test_model_with_input_feed_tensor(): """We test building a model with a TF variable as input. 
We should be able to call fit, evaluate, predict, @@ -806,7 +853,6 @@ def test_model_with_input_feed_tensor(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support models with partial loss yet.') -@keras_test def test_model_with_partial_loss(): a = Input(shape=(3,), name='input_a') a_2 = Dense(4, name='dense_1')(a) @@ -848,7 +894,6 @@ def test_model_with_partial_loss(): out = model.evaluate(input_a_np, [output_a_np]) -@keras_test @pytest.mark.skipif((K.backend() == 'cntk' or K.backend() == 'mxnet'), reason='cntk/mxnet do not support external loss yet') def test_model_with_external_loss(): @@ -921,6 +966,7 @@ def test_model_with_external_loss(): out = model.fit(None, None, epochs=1, steps_per_epoch=1) # define a generator to produce x=None and y=None + @threadsafe_generator def data_tensors_generator(): while True: yield (None, None) @@ -1001,7 +1047,6 @@ def data_tensors_generator(): assert out[1].shape == (10 * 3, 4) -@keras_test def test_target_tensors(): # single-output, as list model = keras.models.Sequential() @@ -1017,6 +1062,11 @@ def test_target_tensors(): target_tensors={'dense': target}) model.train_on_batch(input_val, None) + # single-output, as tensor + model.compile(optimizer='rmsprop', loss='mse', + target_tensors=target) + model.train_on_batch(input_val, None) + # test invalid arguments with pytest.raises(TypeError): model.compile(optimizer='rmsprop', loss='mse', @@ -1053,6 +1103,20 @@ def test_target_tensors(): 'dense_b': target_b}) model.train_on_batch(input_val, None) + # multi-output, not enough target tensors when `target_tensors` is not a dict + with pytest.raises(ValueError, + match='When passing a list as `target_tensors`, it should ' + 'have one entry per model output. The model has \d ' + 'outputs, but you passed target_tensors='): + model.compile(optimizer='rmsprop', loss='mse', + target_tensors=[target_a]) + with pytest.raises(ValueError, + match='The model has \d outputs, but you passed a single ' + 'tensor as `target_tensors`. Expected a list or ' + 'a dict of tensors.'): + model.compile(optimizer='rmsprop', loss='mse', + target_tensors=target_a) + # test with sample weights model.compile(optimizer='rmsprop', loss='mse', target_tensors=[target_a, target_b]) @@ -1060,7 +1124,6 @@ def test_target_tensors(): sample_weight={'dense_a': np.random.random((10,))}) -@keras_test def test_model_custom_target_tensors(): a = Input(shape=(3,), name='input_a') b = Input(shape=(3,), name='input_b') @@ -1122,8 +1185,8 @@ def test_model_custom_target_tensors(): [output_a_np, output_b_np]) -@pytest.mark.skipif(sys.version_info < (3,), reason='Cannot catch warnings in python 2') -@keras_test +@pytest.mark.skipif(sys.version_info < (3,), + reason='Cannot catch warnings in python 2') def test_trainable_weights_count_consistency(): """Tests the trainable weights consistency check of Model. @@ -1148,22 +1211,24 @@ def test_trainable_weights_count_consistency(): with pytest.warns(UserWarning) as w: model2.summary() warning_raised = any(['Discrepancy' in str(w_.message) for w_ in w]) - assert warning_raised, 'No warning raised when trainable is modified without .compile.' + assert warning_raised, ( + 'No warning raised when trainable is modified without .compile.') # And on .fit() with pytest.warns(UserWarning) as w: model2.fit(x=np.zeros((5, 3)), y=np.zeros((5, 1))) warning_raised = any(['Discrepancy' in str(w_.message) for w_ in w]) - assert warning_raised, 'No warning raised when trainable is modified without .compile.' 
+ assert warning_raised, ( + 'No warning raised when trainable is modified without .compile.') # And shouldn't warn if we recompile model2.compile(optimizer='adam', loss='mse') with pytest.warns(None) as w: model2.summary() - assert len(w) == 0, "Warning raised even when .compile() is called after modifying .trainable" + assert len(w) == 0, ( + 'Warning raised even when .compile() is called after modifying .trainable') -@keras_test def test_pandas_dataframe(): input_a = Input(shape=(3,), name='input_a') input_b = Input(shape=(3,), name='input_b') @@ -1243,7 +1308,6 @@ def test_pandas_dataframe(): [output_a_df, output_b_df]) -@keras_test @pytest.mark.skipif(K.backend() != 'tensorflow', reason='Requires TensorFlow') @pytest.mark.skipif((K.backend() == 'tensorflow' and not hasattr(K.get_session(), @@ -1272,7 +1336,6 @@ def test_training_and_eval_methods_on_symbolic_tensors_single_io(): validation_data=(inputs, targets), validation_steps=2) -@keras_test @pytest.mark.skipif(K.backend() != 'tensorflow', reason='Requires TensorFlow') @pytest.mark.skipif((K.backend() == 'tensorflow' and not hasattr(K.get_session(), @@ -1370,7 +1433,6 @@ def test_training_and_eval_methods_on_symbolic_tensors_multi_io(): model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf]) -@keras_test def test_model_with_crossentropy_losses_channels_first(): """Tests use of all crossentropy losses with `channels_first`. @@ -1380,6 +1442,7 @@ def test_model_with_crossentropy_losses_channels_first(): `channels_first` or `channels_last` image_data_format. Tests PR #9715. """ + def prepare_simple_model(input_tensor, loss_name, target): axis = 1 if K.image_data_format() == 'channels_first' else -1 if loss_name == 'sparse_categorical_crossentropy': @@ -1452,7 +1515,6 @@ def prepare_simple_model(input_tensor, loss_name, target): 'channels_first and channels_last.')) -@keras_test def test_dynamic_set_inputs(): model = Sequential() model.add(Dense(16, input_dim=32)) diff --git a/tests/keras/initializers_test.py b/tests/keras/initializers_test.py index 0479a18187d..ffa91b40c6c 100644 --- a/tests/keras/initializers_test.py +++ b/tests/keras/initializers_test.py @@ -104,9 +104,11 @@ def test_orthogonal(tensor_shape): target_mean=0.) -@pytest.mark.parametrize('tensor_shape', [(100, 100), (1, 2, 3, 4)], ids=['FC', 'CONV']) +@pytest.mark.parametrize('tensor_shape', + [(100, 100), (10, 20), (30, 80), (1, 2, 3, 4)], + ids=['FC', 'RNN', 'RNN_INVALID', 'CONV']) def test_identity(tensor_shape): - if len(tensor_shape) > 2: + if len(tensor_shape) > 2 or max(tensor_shape) % min(tensor_shape) != 0: with pytest.raises(ValueError): _runner(initializers.identity(), tensor_shape, target_mean=1. / tensor_shape[0], target_max=1.) 
diff --git a/tests/keras/layers/advanced_activations_test.py b/tests/keras/layers/advanced_activations_test.py index 3e6092f9eeb..ba595e5a199 100644 --- a/tests/keras/layers/advanced_activations_test.py +++ b/tests/keras/layers/advanced_activations_test.py @@ -1,54 +1,80 @@ import pytest from keras.utils.test_utils import layer_test -from keras.utils.test_utils import keras_test from keras import layers +from keras import backend as K -@keras_test def test_leaky_relu(): for alpha in [0., .5, -1.]: layer_test(layers.LeakyReLU, kwargs={'alpha': alpha}, input_shape=(2, 3, 4)) -@keras_test def test_prelu(): layer_test(layers.PReLU, kwargs={}, input_shape=(2, 3, 4)) -@keras_test def test_prelu_share(): layer_test(layers.PReLU, kwargs={'shared_axes': 1}, input_shape=(2, 3, 4)) -@keras_test def test_elu(): for alpha in [0., .5, -1.]: layer_test(layers.ELU, kwargs={'alpha': alpha}, input_shape=(2, 3, 4)) -@keras_test def test_thresholded_relu(): layer_test(layers.ThresholdedReLU, kwargs={'theta': 0.5}, input_shape=(2, 3, 4)) -@keras_test def test_softmax(): for axis in [1, -1]: layer_test(layers.Softmax, kwargs={'axis': axis}, input_shape=(2, 3, 4)) -@keras_test def test_relu(): - for max_value in [None, 1., 6.]: - layer_test(layers.ReLU, kwargs={'max_value': max_value}, + layer_test(layers.ReLU, + kwargs={'max_value': 10, + 'negative_slope': 0.2, + 'threshold': 3.0}, + input_shape=(2, 3, 4)) + layer_test(layers.ReLU, + kwargs={'max_value': 6}, + input_shape=(2, 3, 4)) + layer_test(layers.ReLU, + kwargs={'negative_slope': 0.2}, + input_shape=(2, 3, 4)) + + # max_value of ReLU layer cannot be negative value + with pytest.raises(ValueError): + layer_test(layers.ReLU, kwargs={'max_value': -2.0}, input_shape=(2, 3, 4)) + # negative_slope of ReLU layer cannot be negative value + with pytest.raises(ValueError): + layer_test(layers.ReLU, kwargs={'negative_slope': -2.0}, + input_shape=(2, 3, 4)) + + +@pytest.mark.skipif((K.backend() != 'tensorflow'), + reason='TF-specific implementation.') +def test_relu_tf_ops(): + inputs = layers.Input((3,)) + # Test that `relu` op gets used. + outputs = layers.ReLU()(inputs) + assert outputs.op.name.lower().endswith('/relu') + # Test that `leakyrelu` op gets used. + outputs = layers.ReLU(negative_slope=0.2)(inputs) + assert outputs.op.name.lower().endswith('/leakyrelu') + # Test that `relu6` op gets used. 
+ outputs = layers.ReLU(max_value=6)(inputs) + assert outputs.op.name.lower().endswith('/relu6') + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/layers/convolutional_recurrent_test.py b/tests/keras/layers/convolutional_recurrent_test.py index 9759f530f73..f4987036b51 100644 --- a/tests/keras/layers/convolutional_recurrent_test.py +++ b/tests/keras/layers/convolutional_recurrent_test.py @@ -8,20 +8,22 @@ from keras.utils.test_utils import layer_test from keras import regularizers +num_row = 3 +num_col = 3 +filters = 2 +num_samples = 1 +input_channel = 2 +input_num_row = 5 +input_num_col = 5 +sequence_len = 2 + pytestmark = pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support ConvLSTM2D Layer yet.') def test_convolutional_recurrent(): - num_row = 3 - num_col = 3 - filters = 2 - num_samples = 1 - input_channel = 2 - input_num_row = 5 - input_num_col = 5 - sequence_len = 2 + for data_format in ['channels_first', 'channels_last']: if data_format == 'channels_first': @@ -63,97 +65,101 @@ def test_convolutional_recurrent(): 'padding': 'valid'}, input_shape=inputs.shape) - # No need to check following tests for both data formats - if data_format == 'channels_first' or return_sequences: - continue - # Tests for statefulness - model = Sequential() - kwargs = {'data_format': data_format, - 'return_sequences': return_sequences, - 'filters': filters, - 'kernel_size': (num_row, num_col), - 'stateful': True, - 'batch_input_shape': inputs.shape, - 'padding': 'same'} - layer = convolutional_recurrent.ConvLSTM2D(**kwargs) - - model.add(layer) - model.compile(optimizer='sgd', loss='mse') - out1 = model.predict(np.ones_like(inputs)) - - # train once so that the states change - model.train_on_batch(np.ones_like(inputs), - np.random.random(out1.shape)) - out2 = model.predict(np.ones_like(inputs)) - - # if the state is not reset, output should be different - assert(out1.max() != out2.max()) - - # check that output changes after states are reset - # (even though the model itself didn't change) - layer.reset_states() - out3 = model.predict(np.ones_like(inputs)) - assert(out2.max() != out3.max()) - - # check that container-level reset_states() works - model.reset_states() - out4 = model.predict(np.ones_like(inputs)) - assert_allclose(out3, out4, atol=1e-5) - - # check that the call to `predict` updated the states - out5 = model.predict(np.ones_like(inputs)) - assert(out4.max() != out5.max()) - - # cntk doesn't support eval convolution with static - # variable, will enable it later - if K.backend() != 'cntk': - # check regularizers - kwargs = {'data_format': data_format, - 'return_sequences': return_sequences, - 'kernel_size': (num_row, num_col), - 'stateful': True, - 'filters': filters, - 'batch_input_shape': inputs.shape, - 'kernel_regularizer': regularizers.L1L2(l1=0.01), - 'recurrent_regularizer': regularizers.L1L2(l1=0.01), - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'kernel_constraint': 'max_norm', - 'recurrent_constraint': 'max_norm', - 'bias_constraint': 'max_norm', - 'padding': 'same'} - - layer = convolutional_recurrent.ConvLSTM2D(**kwargs) - layer.build(inputs.shape) - assert len(layer.losses) == 3 - assert layer.activity_regularizer - output = layer(K.variable(np.ones(inputs.shape))) - assert len(layer.losses) == 4 - K.eval(output) - - # check dropout - layer_test(convolutional_recurrent.ConvLSTM2D, - kwargs={'data_format': data_format, - 'return_sequences': return_sequences, - 'filters': filters, - 'kernel_size': (num_row, 
num_col), - 'padding': 'same', - 'dropout': 0.1, - 'recurrent_dropout': 0.1}, - input_shape=inputs.shape) - - # check state initialization - layer = convolutional_recurrent.ConvLSTM2D(filters=filters, - kernel_size=(num_row, num_col), - data_format=data_format, - return_sequences=return_sequences) - layer.build(inputs.shape) - x = Input(batch_shape=inputs.shape) - initial_state = layer.get_initial_state(x) - y = layer(x, initial_state=initial_state) - model = Model(x, y) - assert model.predict(inputs).shape == layer.compute_output_shape(inputs.shape) +def test_convolutional_recurrent_statefulness(): + + data_format = 'channels_last' + return_sequences = False + inputs = np.random.rand(num_samples, sequence_len, + input_num_row, input_num_col, + input_channel) + # Tests for statefulness + model = Sequential() + kwargs = {'data_format': data_format, + 'return_sequences': return_sequences, + 'filters': filters, + 'kernel_size': (num_row, num_col), + 'stateful': True, + 'batch_input_shape': inputs.shape, + 'padding': 'same'} + layer = convolutional_recurrent.ConvLSTM2D(**kwargs) + + model.add(layer) + model.compile(optimizer='sgd', loss='mse') + out1 = model.predict(np.ones_like(inputs)) + + # train once so that the states change + model.train_on_batch(np.ones_like(inputs), + np.random.random(out1.shape)) + out2 = model.predict(np.ones_like(inputs)) + + # if the state is not reset, output should be different + assert(out1.max() != out2.max()) + + # check that output changes after states are reset + # (even though the model itself didn't change) + layer.reset_states() + out3 = model.predict(np.ones_like(inputs)) + assert(out2.max() != out3.max()) + + # check that container-level reset_states() works + model.reset_states() + out4 = model.predict(np.ones_like(inputs)) + assert_allclose(out3, out4, atol=1e-5) + + # check that the call to `predict` updated the states + out5 = model.predict(np.ones_like(inputs)) + assert(out4.max() != out5.max()) + + # cntk doesn't support eval convolution with static + # variable, will enable it later + if K.backend() != 'cntk': + # check regularizers + kwargs = {'data_format': data_format, + 'return_sequences': return_sequences, + 'kernel_size': (num_row, num_col), + 'stateful': True, + 'filters': filters, + 'batch_input_shape': inputs.shape, + 'kernel_regularizer': regularizers.L1L2(l1=0.01), + 'recurrent_regularizer': regularizers.L1L2(l1=0.01), + 'bias_regularizer': 'l2', + 'activity_regularizer': 'l2', + 'kernel_constraint': 'max_norm', + 'recurrent_constraint': 'max_norm', + 'bias_constraint': 'max_norm', + 'padding': 'same'} + + layer = convolutional_recurrent.ConvLSTM2D(**kwargs) + layer.build(inputs.shape) + assert len(layer.losses) == 3 + assert layer.activity_regularizer + output = layer(K.variable(np.ones(inputs.shape))) + assert len(layer.losses) == 4 + K.eval(output) + + # check dropout + layer_test(convolutional_recurrent.ConvLSTM2D, + kwargs={'data_format': data_format, + 'return_sequences': return_sequences, + 'filters': filters, + 'kernel_size': (num_row, num_col), + 'padding': 'same', + 'dropout': 0.1, + 'recurrent_dropout': 0.1}, + input_shape=inputs.shape) + + # check state initialization + layer = convolutional_recurrent.ConvLSTM2D( + filters=filters, kernel_size=(num_row, num_col), + data_format=data_format, return_sequences=return_sequences) + layer.build(inputs.shape) + x = Input(batch_shape=inputs.shape) + initial_state = layer.get_initial_state(x) + y = layer(x, initial_state=initial_state) + model = Model(x, y) + assert 
(model.predict(inputs).shape == + layer.compute_output_shape(inputs.shape)) if __name__ == '__main__': diff --git a/tests/keras/layers/convolutional_test.py b/tests/keras/layers/convolutional_test.py index 71b395e3e04..e5df6fdd4e1 100644 --- a/tests/keras/layers/convolutional_test.py +++ b/tests/keras/layers/convolutional_test.py @@ -3,10 +3,8 @@ from numpy.testing import assert_allclose from keras.utils.test_utils import layer_test -from keras.utils.test_utils import keras_test from keras import backend as K from keras.layers import convolutional -from keras.layers import pooling from keras.models import Sequential @@ -17,151 +15,106 @@ _convolution_paddings = ['valid', 'same'] -@keras_test -def test_causal_dilated_conv(): - # Causal: - # specify to use channels_last data format, - # as default data format for Conv1D is None now - layer_test(convolutional.Conv1D, - input_data=np.reshape(np.arange(4, dtype='float32'), (1, 4, 1)), - kwargs={ - 'filters': 1, - 'kernel_size': 2, - 'dilation_rate': 1, - 'padding': 'causal', - 'kernel_initializer': 'ones', - 'use_bias': False, - 'data_format': 'channels_last' - }, - expected_output=[[[0], [1], [3], [5]]] - ) - - # Non-causal: - layer_test(convolutional.Conv1D, - input_data=np.reshape(np.arange(4, dtype='float32'), (1, 4, 1)), - kwargs={ - 'filters': 1, - 'kernel_size': 2, - 'dilation_rate': 1, - 'padding': 'valid', - 'kernel_initializer': 'ones', - 'use_bias': False, - 'data_format': 'channels_last' - }, - expected_output=[[[1], [3], [5]]] - ) - - # Causal dilated with larger kernel size: - layer_test(convolutional.Conv1D, - input_data=np.reshape(np.arange(10, dtype='float32'), (1, 10, 1)), - kwargs={ - 'filters': 1, - 'kernel_size': 3, - 'dilation_rate': 2, - 'padding': 'causal', - 'kernel_initializer': 'ones', - 'use_bias': False, - 'data_format': 'channels_last' - }, - expected_output=np.float32([[[0], [1], [2], [4], [6], [9], [12], [15], [18], [21]]]) - ) - - -@keras_test -def test_conv_1d(): +@pytest.mark.skipif((K.backend() == 'cntk' and K.dev.type() == 0), + reason='cntk only support dilated conv on GPU') +@pytest.mark.parametrize( + 'layer_kwargs,input_length,expected_output', + [ + # Causal + ({'filters': 1, 'kernel_size': 2, 'dilation_rate': 1, 'padding': 'causal', + 'kernel_initializer': 'ones', 'use_bias': False}, + 4, [[[0], [1], [3], [5]]]), + # Non-causal + ({'filters': 1, 'kernel_size': 2, 'dilation_rate': 1, 'padding': 'valid', + 'kernel_initializer': 'ones', 'use_bias': False}, + 4, [[[1], [3], [5]]]), + # Causal dilated with larger kernel size + ({'filters': 1, 'kernel_size': 3, 'dilation_rate': 2, 'padding': 'causal', + 'kernel_initializer': 'ones', 'use_bias': False}, + 10, np.float32([[[0], [1], [2], [4], [6], [9], [12], [15], [18], [21]]])), + ] +) +def test_causal_dilated_conv(layer_kwargs, input_length, expected_output): + input_data = np.reshape(np.arange(input_length, dtype='float32'), + (1, input_length, 1)) + layer_test(convolutional.Conv1D, input_data=input_data, + kwargs=layer_kwargs, expected_output=expected_output) + + +@pytest.mark.parametrize( + 'padding,strides', + [(padding, strides) + for padding in _convolution_paddings + for strides in [1, 2] + if not (padding == 'same' and strides != 1)] +) +def test_conv_1d(padding, strides): batch_size = 2 steps = 8 input_dim = 2 kernel_size = 3 filters = 3 - # Add causal padding for testing - paddings = ['causal'] + _convolution_paddings if K.backend() != 'theano' else _convolution_paddings - - for padding in paddings: - for strides in [1, 2]: - if 
K.image_data_format() == 'channels_first': - if padding == 'causal': - # don't test causal with channels_first data as specified in convolutional.py: - # When using causal padding in `Conv1D`, - # `data_format` must be "channels_last" (temporal data). - continue - input_shape = (batch_size, input_dim, steps) - else: - input_shape = (batch_size, steps, input_dim) - if padding == 'same' and strides != 1: - continue - layer_test(convolutional.Conv1D, - kwargs={'filters': filters, - 'kernel_size': kernel_size, - 'padding': padding, - 'strides': strides}, - input_shape=input_shape) - - layer_test(convolutional.Conv1D, - kwargs={'filters': filters, - 'kernel_size': kernel_size, - 'padding': padding, - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'kernel_constraint': 'max_norm', - 'bias_constraint': 'max_norm', - 'strides': strides}, - input_shape=input_shape) - - # Test dilation layer_test(convolutional.Conv1D, kwargs={'filters': filters, 'kernel_size': kernel_size, 'padding': padding, - 'dilation_rate': 2}, + 'strides': strides}, input_shape=(batch_size, steps, input_dim)) - # Test channels_first layer_test(convolutional.Conv1D, kwargs={'filters': filters, 'kernel_size': kernel_size, - 'data_format': 'channels_first'}, - input_shape=(batch_size, input_dim, steps)) + 'padding': padding, + 'kernel_regularizer': 'l2', + 'bias_regularizer': 'l2', + 'activity_regularizer': 'l2', + 'kernel_constraint': 'max_norm', + 'bias_constraint': 'max_norm', + 'strides': strides}, + input_shape=(batch_size, steps, input_dim)) -@keras_test -def test_maxpooling_1d(): - # MXNet backend does not support pooling with same mode yet. - if K.backend() == 'mxnet': - padding_modes = ['valid'] - else: - padding_modes = ['valid', 'same'] +@pytest.mark.skipif((K.backend() == 'cntk' and K.dev.type() == 0), + reason='cntk only support dilated conv on GPU') +def test_conv_1d_dilation(): + batch_size = 2 + steps = 8 + input_dim = 2 + kernel_size = 3 + filters = 3 + padding = _convolution_paddings[-1] - for padding in padding_modes: - for stride in [1, 2]: - layer_test(convolutional.MaxPooling1D, - kwargs={'strides': stride, - 'padding': padding}, - input_shape=(3, 5, 4)) + layer_test(convolutional.Conv1D, + kwargs={'filters': filters, + 'kernel_size': kernel_size, + 'padding': padding, + 'dilation_rate': 2}, + input_shape=(batch_size, steps, input_dim)) -@keras_test -def test_averagepooling_1d(): - # MXNet backend does not support pooling with same mode yet. 
- if K.backend() == 'mxnet': - padding_modes = ['valid'] - else: - padding_modes = ['valid', 'same'] +def test_conv_1d_channels_first(): + batch_size = 2 + steps = 8 + input_dim = 2 + kernel_size = 3 + filters = 3 - for padding in padding_modes: - for stride in [1, 2]: - layer_test(convolutional.AveragePooling1D, - kwargs={'strides': stride, - 'padding': padding}, - input_shape=(3, 5, 4)) + layer_test(convolutional.Conv1D, + kwargs={'filters': filters, + 'kernel_size': kernel_size, + 'data_format': 'channels_first'}, + input_shape=(batch_size, input_dim, steps)) -@keras_test -@pytest.mark.skipif((K.backend() == 'cntk'), - reason='cntk does not support dilated conv') -def test_convolution_2d(): +@pytest.mark.parametrize( + 'strides,padding', + [(strides, padding) + for padding in _convolution_paddings + for strides in [(1, 1), (2, 2)] + if not (padding == 'same' and strides != (1, 1))] +) +def test_convolution_2d(strides, padding): num_samples = 2 filters = 2 stack_size = 3 @@ -169,18 +122,23 @@ def test_convolution_2d(): num_row = 7 num_col = 6 - for padding in _convolution_paddings: - for strides in [(1, 1), (2, 2)]: - if padding == 'same' and strides != (1, 1): - continue + layer_test(convolutional.Conv2D, + kwargs={'filters': filters, + 'kernel_size': kernel_size, + 'padding': padding, + 'strides': strides, + 'data_format': 'channels_first'}, + input_shape=(num_samples, stack_size, num_row, num_col)) - layer_test(convolutional.Conv2D, - kwargs={'filters': filters, - 'kernel_size': kernel_size, - 'padding': padding, - 'strides': strides, - 'data_format': 'channels_first'}, - input_shape=(num_samples, stack_size, num_row, num_col)) + +def test_convolution_2d_channels_last(): + num_samples = 2 + filters = 2 + stack_size = 3 + num_row = 7 + num_col = 6 + padding = 'valid' + strides = (2, 2) layer_test(convolutional.Conv2D, kwargs={'filters': filters, @@ -196,7 +154,18 @@ def test_convolution_2d(): 'strides': strides}, input_shape=(num_samples, num_row, num_col, stack_size)) - # Test dilation + +@pytest.mark.skipif((K.backend() == 'cntk' and K.dev.type() == 0), + reason='cntk only supports dilated conv on GPU') +def test_convolution_2d_dilation(): + num_samples = 2 + filters = 2 + stack_size = 3 + kernel_size = (3, 2) + num_row = 7 + num_col = 6 + padding = 'valid' + layer_test(convolutional.Conv2D, kwargs={'filters': filters, 'kernel_size': kernel_size, @@ -204,36 +173,84 @@ def test_convolution_2d(): 'dilation_rate': (2, 2)}, input_shape=(num_samples, num_row, num_col, stack_size)) - # Test invalid use case + +def test_convolution_2d_invalid(): + filters = 2 + padding = _convolution_paddings[-1] + kernel_size = (3, 2) + with pytest.raises(ValueError): - model = Sequential([convolutional.Conv2D(filters=filters, - kernel_size=kernel_size, - padding=padding, - batch_input_shape=(None, None, 5, None))]) + model = Sequential([convolutional.Conv2D( + filters=filters, kernel_size=kernel_size, padding=padding, + batch_input_shape=(None, None, 5, None))]) -@pytest.mark.skipif((K.backend() == 'mxnet'), - reason='MXNet backend does not support Conv2D Transpose yet.') -@keras_test -def test_conv2d_transpose(): +@pytest.mark.parametrize( + 'padding,out_padding,strides', + [(padding, out_padding, strides) + for padding in _convolution_paddings + for out_padding in [None, (0, 0), (1, 1)] + for strides in [(1, 1), (2, 2)] + if (not (padding == 'same' and strides != (1, 1)) + and not (strides == (1, 1) and out_padding == (1, 1)) + and not (padding == 'valid' and out_padding == (1, 1) and strides == (2, 
2)))] +) +def test_conv2d_transpose(padding, out_padding, strides): num_samples = 2 filters = 2 stack_size = 3 num_row = 5 num_col = 6 - for padding in _convolution_paddings: - for strides in [(1, 1), (2, 2)]: - if padding == 'same' and strides != (1, 1): - continue - layer_test(convolutional.Deconvolution2D, - kwargs={'filters': filters, - 'kernel_size': 3, - 'padding': padding, - 'strides': strides, - 'data_format': 'channels_last'}, - input_shape=(num_samples, num_row, num_col, stack_size), - fixed_batch_size=True) + layer_test(convolutional.Conv2DTranspose, + kwargs={'filters': filters, + 'kernel_size': 3, + 'padding': padding, + 'output_padding': out_padding, + 'strides': strides, + 'data_format': 'channels_last'}, + input_shape=(num_samples, num_row, num_col, stack_size), + fixed_batch_size=True) + + +@pytest.mark.skipif((K.backend() == 'cntk' and K.dev.type() == 0), + reason='cntk only supports dilated conv transpose on GPU') +def test_conv2d_transpose_dilation(): + + layer_test(convolutional.Conv2DTranspose, + kwargs={'filters': 2, + 'kernel_size': 3, + 'padding': 'same', + 'data_format': 'channels_last', + 'dilation_rate': (2, 2)}, + input_shape=(2, 5, 6, 3)) + + # Check dilated conv transpose returns expected output + input_data = np.arange(48).reshape((1, 4, 4, 3)).astype(np.float32) + expected_output = np.float32([[192, 228, 192, 228], + [336, 372, 336, 372], + [192, 228, 192, 228], + [336, 372, 336, 372]]).reshape((1, 4, 4, 1)) + + layer_test(convolutional.Conv2DTranspose, + input_data=input_data, + kwargs={'filters': 1, + 'kernel_size': 3, + 'padding': 'same', + 'data_format': 'channels_last', + 'dilation_rate': (2, 2), + 'kernel_initializer': 'ones'}, + expected_output=expected_output) + + +def test_conv2d_transpose_channels_first(): + num_samples = 2 + filters = 2 + stack_size = 3 + num_row = 5 + num_col = 6 + padding = 'valid' + strides = (2, 2) layer_test(convolutional.Deconvolution2D, kwargs={'filters': filters, @@ -250,42 +267,81 @@ def test_conv2d_transpose(): input_shape=(num_samples, stack_size, num_row, num_col), fixed_batch_size=True) - # Test invalid use case + +def test_conv2d_transpose_invalid(): + filters = 2 + stack_size = 3 + num_row = 5 + num_col = 6 + padding = 'valid' + + with pytest.raises(ValueError): + model = Sequential([convolutional.Conv2DTranspose( + filters=filters, + kernel_size=3, + padding=padding, + use_bias=True, + batch_input_shape=(None, None, 5, None))]) + + # Test invalid output padding for given stride. 
Output padding equal to stride with pytest.raises(ValueError): - model = Sequential([convolutional.Conv2DTranspose(filters=filters, - kernel_size=3, - padding=padding, - batch_input_shape=(None, None, 5, None))]) + model = Sequential([convolutional.Conv2DTranspose( + filters=filters, + kernel_size=3, + padding=padding, + output_padding=(0, 3), + strides=(1, 3), + batch_input_shape=(None, num_row, num_col, stack_size))]) + + # Output padding greater than stride + with pytest.raises(ValueError): + model = Sequential([convolutional.Conv2DTranspose( + filters=filters, + kernel_size=3, + padding=padding, + output_padding=(2, 2), + strides=(1, 3), + batch_input_shape=(None, num_row, num_col, stack_size))]) @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend does not support Separable Conv1D yet.') -@keras_test -def test_separable_conv_1d(): +@pytest.mark.parametrize( + 'padding,strides,multiplier,dilation_rate', + [(padding, strides, multiplier, dilation_rate) + for padding in _convolution_paddings + for strides in [1, 2] + for multiplier in [1, 2] + for dilation_rate in [1, 2] + if (not (padding == 'same' and strides != 1) + and not (dilation_rate != 1 and strides != 1) + and not (dilation_rate != 1 and K.backend() == 'cntk'))] +) +def test_separable_conv_1d(padding, strides, multiplier, dilation_rate): num_samples = 2 filters = 6 stack_size = 3 num_step = 9 - for padding in _convolution_paddings: - for strides in [1, 2]: - for multiplier in [1, 2]: - for dilation_rate in [1, 2]: - if padding == 'same' and strides != 1: - continue - if dilation_rate != 1 and strides != 1: - continue - if dilation_rate != 1 and K.backend() == 'cntk': - continue - - layer_test(convolutional.SeparableConv1D, - kwargs={'filters': filters, - 'kernel_size': 3, - 'padding': padding, - 'strides': strides, - 'depth_multiplier': multiplier, - 'dilation_rate': dilation_rate}, - input_shape=(num_samples, num_step, stack_size)) + layer_test(convolutional.SeparableConv1D, + kwargs={'filters': filters, + 'kernel_size': 3, + 'padding': padding, + 'strides': strides, + 'depth_multiplier': multiplier, + 'dilation_rate': dilation_rate}, + input_shape=(num_samples, num_step, stack_size)) + + +@pytest.mark.skipif(K.backend() == 'mxnet', + reason='MXNet Backend does not support seprable conv 1d yet') +def test_separable_conv_1d_additional_args(): + num_samples = 2 + filters = 6 + stack_size = 3 + num_step = 9 + padding = 'valid' + multiplier = 2 layer_test(convolutional.SeparableConv1D, kwargs={'filters': filters, @@ -304,49 +360,62 @@ def test_separable_conv_1d(): 'depth_multiplier': multiplier}, input_shape=(num_samples, stack_size, num_step)) - # Test invalid use case + +def test_separable_conv_1d_invalid(): + filters = 6 + padding = 'valid' with pytest.raises(ValueError): - model = Sequential([convolutional.SeparableConv1D(filters=filters, - kernel_size=3, - padding=padding, - batch_input_shape=(None, 5, None))]) + model = Sequential([convolutional.SeparableConv1D( + filters=filters, kernel_size=3, padding=padding, + batch_input_shape=(None, 5, None))]) + + +@pytest.mark.parametrize( + 'padding,strides,multiplier,dilation_rate', + [(padding, strides, multiplier, dilation_rate) + for padding in _convolution_paddings + for strides in [(1, 1), (2, 2)] + for multiplier in [1, 2] + for dilation_rate in [(1, 1), (2, 2), (2, 1), (1, 2)] + if (not (padding == 'same' and strides != (1, 1)) + and not (dilation_rate != (1, 1) and strides != (1, 1)) + and not (dilation_rate != (1, 1) and multiplier == dilation_rate[0]) + and 
not (dilation_rate != (1, 1) and K.backend() == 'cntk'))] +) +def test_separable_conv_2d(padding, strides, multiplier, dilation_rate): + num_samples = 2 + filters = 6 + stack_size = 3 + num_row = 7 + num_col = 6 + # MXNet only support multiplier = 1 + if K.backend() == 'mxnet': + multiplier = 1 + layer_test( + convolutional.SeparableConv2D, + kwargs={'filters': filters, + 'kernel_size': (3, 3), + 'padding': padding, + 'strides': strides, + 'depth_multiplier': multiplier, + 'dilation_rate': dilation_rate}, + input_shape=(num_samples, num_row, num_col, stack_size)) -@pytest.mark.skipif((K.backend() == 'mxnet'), - reason='MXNet backend does not support Separable Conv2D yet.') -@keras_test -def test_separable_conv_2d(): + +def test_separable_conv_2d_additional_args(): num_samples = 2 filters = 6 stack_size = 3 num_row = 7 num_col = 6 - # MXNet only support depth_multiplier=1 and strides (2, 2) - # TODO: fully support depth_multiplier for depthwise_conv2d - depth_multipliers = [1] if K.backend() == 'mxnet' else [1, 2] - - for padding in _convolution_paddings: - for strides in [(1, 1), (2, 2)]: - for multiplier in depth_multipliers: - for dilation_rate in [(1, 1), (2, 2), (2, 1), (1, 2)]: - if padding == 'same' and strides != (1, 1): - continue - if dilation_rate != (1, 1) and strides != (1, 1): - continue - if dilation_rate != (1, 1) and multiplier == dilation_rate[0]: - continue - if dilation_rate != (1, 1) and K.backend() == 'cntk': - continue - - layer_test(convolutional.SeparableConv2D, - kwargs={'filters': filters, - 'kernel_size': (3, 3), - 'padding': padding, - 'strides': strides, - 'depth_multiplier': multiplier, - 'dilation_rate': dilation_rate, - 'data_format': 'channels_last'}, - input_shape=(num_samples, num_row, num_col, stack_size)) + padding = 'valid' + strides = (2, 2) + # MXNet backend only supports multiplier = 1 + if K.backend() == 'mxnet': + multiplier = 1 + else: + multiplier = 2 layer_test(convolutional.SeparableConv2D, kwargs={'filters': filters, @@ -363,39 +432,58 @@ def test_separable_conv_2d(): 'strides': strides, 'depth_multiplier': multiplier}, input_shape=(num_samples, stack_size, num_row, num_col)) - # Test invalid use case + + +def test_separable_conv_2d_invalid(): + filters = 6 + padding = 'valid' with pytest.raises(ValueError): - model = Sequential([convolutional.SeparableConv2D(filters=filters, - kernel_size=3, - padding=padding, - batch_input_shape=(None, None, 5, None))]) + model = Sequential([convolutional.SeparableConv2D( + filters=filters, kernel_size=3, padding=padding, + batch_input_shape=(None, None, 5, None))]) -@keras_test -def test_depthwise_conv_2d(): +@pytest.mark.parametrize( + 'padding,strides,multiplier', + [(padding, strides, multiplier) + for padding in _convolution_paddings + for strides in [(1, 1), (2, 2)] + for multiplier in [1, 2] + if not (padding == 'same' and strides != (1, 1))] +) +def test_depthwise_conv_2d(padding, strides, multiplier): num_samples = 2 stack_size = 3 num_row = 7 num_col = 6 # MXNet only support depth_multiplier=1 # TODO: fully support depth_multiplier for depthwise_conv2d - mutipliers = [1] if K.backend() == 'mxnet' else [1, 2] - - for padding in _convolution_paddings: - for strides in [(1, 1), (2, 2)]: - for multiplier in mutipliers: - if padding == 'same' and strides != (1, 1): - continue - - layer_test(convolutional.DepthwiseConv2D, - kwargs={'kernel_size': (3, 3), - 'padding': padding, - 'strides': strides, - 'depth_multiplier': multiplier}, - input_shape=(num_samples, - num_row, - num_col, - stack_size)) + if 
K.backend() == 'mxnet': + multiplier = 1 + + layer_test(convolutional.DepthwiseConv2D, + kwargs={'kernel_size': (3, 3), + 'padding': padding, + 'strides': strides, + 'depth_multiplier': multiplier}, + input_shape=(num_samples, + num_row, + num_col, + stack_size)) + + +def test_depthwise_conv_2d_additional_args(): + num_samples = 2 + stack_size = 3 + num_row = 7 + num_col = 6 + padding = 'valid' + strides = (2, 2) + # MXNet backend only supports multiplier = 1 + if K.backend() == 'mxnet': + multiplier = 1 + else: + multiplier = 2 layer_test(convolutional.DepthwiseConv2D, kwargs={'kernel_size': 3, @@ -411,7 +499,9 @@ def test_depthwise_conv_2d(): 'depth_multiplier': multiplier}, input_shape=(num_samples, stack_size, num_row, num_col)) - # Test invalid use case + +def test_depthwise_conv_2d_invalid(): + padding = 'valid' with pytest.raises(ValueError): Sequential([convolutional.DepthwiseConv2D( kernel_size=3, @@ -419,80 +509,14 @@ def test_depthwise_conv_2d(): batch_input_shape=(None, None, 5, None))]) -@keras_test -def test_globalpooling_1d(): - layer_test(pooling.GlobalMaxPooling1D, - input_shape=(3, 4, 5)) - layer_test(pooling.GlobalAveragePooling1D, - input_shape=(3, 4, 5)) - - -@keras_test -def test_globalpooling_2d(): - layer_test(pooling.GlobalMaxPooling2D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 5, 6)) - layer_test(pooling.GlobalMaxPooling2D, - kwargs={'data_format': 'channels_last'}, - input_shape=(3, 5, 6, 4)) - layer_test(pooling.GlobalAveragePooling2D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 5, 6)) - layer_test(pooling.GlobalAveragePooling2D, - kwargs={'data_format': 'channels_last'}, - input_shape=(3, 5, 6, 4)) - - -@keras_test -def test_globalpooling_3d(): - layer_test(pooling.GlobalMaxPooling3D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 3, 4, 3)) - layer_test(pooling.GlobalMaxPooling3D, - kwargs={'data_format': 'channels_last'}, - input_shape=(3, 4, 3, 4, 3)) - layer_test(pooling.GlobalAveragePooling3D, - kwargs={'data_format': 'channels_first'}, - input_shape=(3, 4, 3, 4, 3)) - layer_test(pooling.GlobalAveragePooling3D, - kwargs={'data_format': 'channels_last'}, - input_shape=(3, 4, 3, 4, 3)) - - -@keras_test -def test_maxpooling_2d(): - pool_size = (3, 3) - - for strides in [(1, 1), (2, 2)]: - layer_test(convolutional.MaxPooling2D, - kwargs={'strides': strides, - 'padding': 'valid', - 'pool_size': pool_size}, - input_shape=(3, 5, 6, 4)) - - -@keras_test -def test_averagepooling_2d(): - layer_test(convolutional.AveragePooling2D, - kwargs={'strides': (2, 2), - 'padding': 'same', - 'pool_size': (2, 2)}, - input_shape=(3, 5, 6, 4)) - layer_test(convolutional.AveragePooling2D, - kwargs={'strides': (2, 2), - 'padding': 'valid', - 'pool_size': (3, 3)}, - input_shape=(3, 5, 6, 4)) - layer_test(convolutional.AveragePooling2D, - kwargs={'strides': (1, 1), - 'padding': 'valid', - 'pool_size': (2, 2), - 'data_format': 'channels_first'}, - input_shape=(3, 4, 5, 6)) - - -@keras_test -def test_convolution_3d(): +@pytest.mark.parametrize( + 'padding,strides', + [(padding, strides) + for padding in _convolution_paddings + for strides in [(1, 1, 1), (2, 2, 2)] + if not (padding == 'same' and strides != (1, 1, 1))] +) +def test_convolution_3d(padding, strides): num_samples = 2 filters = 2 stack_size = 3 @@ -501,18 +525,26 @@ def test_convolution_3d(): input_len_dim2 = 8 input_len_dim3 = 8 - for padding in _convolution_paddings: - for strides in [(1, 1, 1), (2, 2, 2)]: - if padding == 'same' and strides != (1, 1, 1): - 
continue + layer_test(convolutional.Convolution3D, + kwargs={'filters': filters, + 'kernel_size': 3, + 'padding': padding, + 'strides': strides}, + input_shape=(num_samples, + input_len_dim1, input_len_dim2, input_len_dim3, + stack_size)) + - layer_test(convolutional.Convolution3D, - kwargs={'filters': filters, - 'kernel_size': 3, - 'padding': padding, - 'strides': strides}, - input_shape=(num_samples, stack_size, - input_len_dim1, input_len_dim2, input_len_dim3)) +def test_convolution_3d_additional_args(): + num_samples = 2 + filters = 2 + stack_size = 3 + padding = 'valid' + strides = (2, 2, 2) + + input_len_dim1 = 9 + input_len_dim2 = 8 + input_len_dim3 = 8 layer_test(convolutional.Convolution3D, kwargs={'filters': filters, @@ -530,33 +562,47 @@ def test_convolution_3d(): stack_size)) -@pytest.mark.skipif((K.backend() == 'mxnet'), - reason='MXNet backend does not support Conv3D Transpose on CPU yet.') -@keras_test -def test_conv3d_transpose(): +@pytest.mark.skipif(K.backend() == 'mxnet' and K.get_num_gpus() == 0, + reason='MXNet Backend only support conv3d transpose on GPU') +@pytest.mark.parametrize( + 'padding,out_padding,strides,data_format', + [(padding, out_padding, strides, data_format) + for padding in _convolution_paddings + for out_padding in [None, (0, 0, 0), (1, 1, 1)] + for strides in [(1, 1, 1), (2, 2, 2)] + for data_format in ['channels_first', 'channels_last'] + if (not (padding == 'same' and strides != (1, 1, 1)) + and not (strides == (1, 1, 1) and out_padding == (1, 1, 1)))] +) +def test_conv3d_transpose(padding, out_padding, strides, data_format): filters = 2 stack_size = 3 num_depth = 7 num_row = 5 num_col = 6 - for padding in _convolution_paddings: - for out_padding in [None, (0, 0, 0), (1, 1, 1)]: - for strides in [(1, 1, 1), (2, 2, 2)]: - for data_format in ['channels_first', 'channels_last']: - if padding == 'same' and strides != (1, 1, 1): - continue - if strides == (1, 1, 1) and out_padding == (1, 1, 1): - continue - layer_test(convolutional.Conv3DTranspose, - kwargs={'filters': filters, - 'kernel_size': 3, - 'padding': padding, - 'output_padding': out_padding, - 'strides': strides, - 'data_format': data_format}, - input_shape=(None, num_depth, num_row, num_col, stack_size), - fixed_batch_size=True) + layer_test( + convolutional.Conv3DTranspose, + kwargs={'filters': filters, + 'kernel_size': 3, + 'padding': padding, + 'output_padding': out_padding, + 'strides': strides, + 'data_format': data_format}, + input_shape=(None, num_depth, num_row, num_col, stack_size), + fixed_batch_size=True) + + +@pytest.mark.skipif(K.backend() == 'mxnet' and K.get_num_gpus() == 0, + reason='MXNet Backend only support conv3d transpose on GPU') +def test_conv3d_transpose_additional_args(): + filters = 2 + stack_size = 3 + num_depth = 7 + num_row = 5 + num_col = 6 + padding = 'valid' + strides = (2, 2, 2) layer_test(convolutional.Conv3DTranspose, kwargs={'filters': filters, @@ -574,6 +620,17 @@ def test_conv3d_transpose(): input_shape=(None, stack_size, num_depth, num_row, num_col), fixed_batch_size=True) + +@pytest.mark.skipif(K.backend() == 'mxnet' and K.get_num_gpus() == 0, + reason='MXNet Backend only support conv3d transpose on GPU') +def test_conv3d_transpose_invalid(): + filters = 2 + stack_size = 3 + num_depth = 7 + num_row = 5 + num_col = 6 + padding = 'valid' + # Test invalid use case with pytest.raises(ValueError): model = Sequential([convolutional.Conv3DTranspose( @@ -592,6 +649,7 @@ def test_conv3d_transpose(): output_padding=(0, 3, 3), strides=(1, 3, 4), 
batch_input_shape=(None, num_depth, num_row, num_col, stack_size))]) + # Output padding greater than stride with pytest.raises(ValueError): model = Sequential([convolutional.Conv3DTranspose( @@ -603,41 +661,6 @@ def test_conv3d_transpose(): batch_input_shape=(None, num_depth, num_row, num_col, stack_size))]) -@keras_test -def test_maxpooling_3d(): - pool_size = (3, 3, 3) - - layer_test(convolutional.MaxPooling3D, - kwargs={'strides': 2, - 'padding': 'valid', - 'pool_size': pool_size}, - input_shape=(3, 11, 12, 10, 4)) - layer_test(convolutional.MaxPooling3D, - kwargs={'strides': 3, - 'padding': 'valid', - 'data_format': 'channels_first', - 'pool_size': pool_size}, - input_shape=(3, 4, 11, 12, 10)) - - -@keras_test -def test_averagepooling_3d(): - pool_size = (3, 3, 3) - - layer_test(convolutional.AveragePooling3D, - kwargs={'strides': 2, - 'padding': 'valid', - 'pool_size': pool_size}, - input_shape=(3, 11, 12, 10, 4)) - layer_test(convolutional.AveragePooling3D, - kwargs={'strides': 3, - 'padding': 'valid', - 'data_format': 'channels_first', - 'pool_size': pool_size}, - input_shape=(3, 4, 11, 12, 10)) - - -@keras_test def test_zero_padding_1d(): num_samples = 2 input_dim = 2 @@ -674,25 +697,36 @@ def test_zero_padding_1d(): layer.get_config() -@keras_test -def test_zero_padding_2d(): +@pytest.mark.parametrize( + 'data_format,padding', + [(data_format, padding) + for data_format in ['channels_first', 'channels_last'] + for padding in [(2, 2), ((1, 2), (3, 4))]] +) +def test_zero_padding_2d(data_format, padding): num_samples = 2 stack_size = 2 input_num_row = 4 input_num_col = 5 - for data_format in ['channels_first', 'channels_last']: + + if data_format == 'channels_last': inputs = np.ones((num_samples, input_num_row, input_num_col, stack_size)) + else: inputs = np.ones((num_samples, stack_size, input_num_row, input_num_col)) - # basic test - layer_test(convolutional.ZeroPadding2D, - kwargs={'padding': (2, 2), 'data_format': data_format}, - input_shape=inputs.shape) - layer_test(convolutional.ZeroPadding2D, - kwargs={'padding': ((1, 2), (3, 4)), 'data_format': data_format}, - input_shape=inputs.shape) + layer_test(convolutional.ZeroPadding2D, + kwargs={'padding': padding, 'data_format': data_format}, + input_shape=inputs.shape) - # correctness test + +def test_zero_padding_2d_correctness(): + num_samples = 2 + stack_size = 2 + input_num_row = 4 + input_num_col = 5 + inputs = np.ones((num_samples, stack_size, input_num_row, input_num_col)) + + for data_format in ['channels_first', 'channels_last']: layer = convolutional.ZeroPadding2D(padding=(2, 2), data_format=data_format) layer.build(inputs.shape) @@ -736,27 +770,38 @@ def test_zero_padding_2d(): assert_allclose(np_output[:, :, 1:-2, 3:-4], 1.) 
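(Editorial aside, not part of the patch: the ZeroPadding2D correctness check that the hunks above split into its own test reduces to the standalone sketch below; the input shape and the symmetric (2, 2) padding are illustrative choices, not values taken from the diff.)

import numpy as np
from numpy.testing import assert_allclose
from keras import backend as K
from keras.layers import convolutional

# Build a ZeroPadding2D layer and run it on an all-ones channels_last input.
inputs = np.ones((2, 4, 5, 2))
layer = convolutional.ZeroPadding2D(padding=(2, 2), data_format='channels_last')
layer.build(inputs.shape)
np_output = K.eval(layer(K.variable(inputs)))

# The padded border must be exactly zero on every side...
assert_allclose(np_output[:, :2, :, :], 0.)
assert_allclose(np_output[:, -2:, :, :], 0.)
assert_allclose(np_output[:, :, :2, :], 0.)
assert_allclose(np_output[:, :, -2:, :], 0.)
# ...while the interior keeps the original (all-ones) values.
assert_allclose(np_output[:, 2:-2, 2:-2, :], 1.)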
-def test_zero_padding_3d(): +@pytest.mark.parametrize( + 'data_format,padding', + [(data_format, padding) + for data_format in ['channels_first', 'channels_last'] + for padding in [(2, 2, 2), ((1, 2), (3, 4), (0, 2))]] +) +def test_zero_padding_3d(data_format, padding): num_samples = 2 stack_size = 2 input_len_dim1 = 4 input_len_dim2 = 5 input_len_dim3 = 3 - inputs = np.ones((num_samples, input_len_dim1, input_len_dim2, input_len_dim3, stack_size)) - # basic test - for data_format in ['channels_first', 'channels_last']: - layer_test(convolutional.ZeroPadding3D, - kwargs={'padding': (2, 2, 2), 'data_format': data_format}, - input_shape=inputs.shape) - layer_test(convolutional.ZeroPadding3D, - kwargs={'padding': ((1, 2), (3, 4), (0, 2)), 'data_format': data_format}, - input_shape=inputs.shape) + layer_test(convolutional.ZeroPadding3D, + kwargs={'padding': padding, 'data_format': data_format}, + input_shape=inputs.shape) - # correctness test + +def test_zero_padding_3d_correctness(): + num_samples = 2 + stack_size = 2 + input_len_dim1 = 4 + input_len_dim2 = 5 + input_len_dim3 = 3 + inputs = np.ones((num_samples, + input_len_dim1, input_len_dim2, input_len_dim3, + stack_size)) + + for data_format in ['channels_first', 'channels_last']: layer = convolutional.ZeroPadding3D(padding=(2, 2, 2), data_format=data_format) layer.build(inputs.shape) @@ -798,14 +843,12 @@ def test_zero_padding_3d(): assert_allclose(np_output[:, :, 1:-2, 3:-4, 0:-2], 1.) -@keras_test def test_upsampling_1d(): layer_test(convolutional.UpSampling1D, kwargs={'size': 2}, input_shape=(3, 5, 4)) -@keras_test def test_upsampling_2d(): num_samples = 2 stack_size = 2 @@ -851,6 +894,46 @@ def test_upsampling_2d(): assert_allclose(np_output, expected_out) +@pytest.mark.skipif((K.backend() == 'cntk'), + reason='cntk does not support it yet') +@pytest.mark.parametrize('data_format', + ['channels_first', 'channels_last']) +def test_upsampling_2d_bilinear(data_format): + num_samples = 2 + stack_size = 2 + input_num_row = 11 + input_num_col = 12 + + if data_format == 'channels_first': + inputs = np.random.rand(num_samples, stack_size, input_num_row, + input_num_col) + else: # tf + inputs = np.random.rand(num_samples, input_num_row, input_num_col, + stack_size) + + # basic test + layer_test(convolutional.UpSampling2D, + kwargs={'size': (2, 2), + 'data_format': data_format, + 'interpolation': 'bilinear'}, + input_shape=inputs.shape) + + for length_row in [2]: + for length_col in [2, 3]: + layer = convolutional.UpSampling2D( + size=(length_row, length_col), + data_format=data_format) + layer.build(inputs.shape) + outputs = layer(K.variable(inputs)) + np_output = K.eval(outputs) + if data_format == 'channels_first': + assert np_output.shape[2] == length_row * input_num_row + assert np_output.shape[3] == length_col * input_num_col + else: # tf + assert np_output.shape[1] == length_row * input_num_row + assert np_output.shape[2] == length_col * input_num_col + + @pytest.mark.skipif((K.backend() == 'cntk'), reason="cntk does not support it yet") def test_upsampling_3d(): @@ -906,7 +989,6 @@ def test_upsampling_3d(): assert_allclose(np_output, expected_out) -@keras_test @pytest.mark.skipif((K.backend() == 'cntk' or K.backend() == 'mxnet'), reason='cntk/mxnet does not support slice to 0 dimension') def test_cropping_1d(): @@ -1054,5 +1136,30 @@ def test_cropping_3d(): with pytest.raises(ValueError): layer = convolutional.Cropping3D(cropping=lambda x: x) + +@pytest.mark.skipif((K.backend() == 'mxnet'), + reason='MXNet backend does not support 
re-setting weight to float64') +@pytest.mark.skipif((K.backend() == 'cntk'), + reason='CNTK does not support float64') +@pytest.mark.parametrize( + 'input_shape,conv_class', + [((2, 4, 2), convolutional.Conv1D), + ((2, 4, 4, 2), convolutional.Conv2D), + ((2, 4, 4, 4, 2), convolutional.Conv3D)] +) +def test_conv_float64(input_shape, conv_class): + kernel_size = 3 + strides = 1 + filters = 3 + K.set_floatx('float64') + layer_test(conv_class, + kwargs={'filters': filters, + 'kernel_size': kernel_size, + 'padding': 'valid', + 'strides': strides}, + input_shape=input_shape) + K.set_floatx('float32') + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/layers/core_test.py b/tests/keras/layers/core_test.py index c27975ee44d..ba6720d8b0f 100644 --- a/tests/keras/layers/core_test.py +++ b/tests/keras/layers/core_test.py @@ -7,20 +7,17 @@ from keras.models import Model from keras.models import Sequential from keras.utils.test_utils import layer_test -from keras.utils.test_utils import keras_test from keras import regularizers from keras import constraints from keras.layers import deserialize as deserialize_layer -@keras_test def test_masking(): layer_test(layers.Masking, kwargs={}, input_shape=(3, 2, 3)) -@keras_test def test_dropout(): layer_test(layers.Dropout, kwargs={'rate': 0.5}, @@ -44,20 +41,23 @@ def test_dropout(): input_shape = (2,) + shape + (3,) else: input_shape = (2, 3) + shape - layer_test(layers.SpatialDropout2D if len(shape) == 2 else layers.SpatialDropout3D, + if len(shape) == 2: + layer = layers.SpatialDropout2D + else: + layer = layers.SpatialDropout3D + layer_test(layer, kwargs={'rate': 0.5, 'data_format': data_format}, input_shape=input_shape) # Test invalid use cases with pytest.raises(ValueError): - layer_test(layers.SpatialDropout2D if len(shape) == 2 else layers.SpatialDropout3D, + layer_test(layer, kwargs={'rate': 0.5, 'data_format': 'channels_middle'}, input_shape=input_shape) -@keras_test def test_activation(): # with string argument layer_test(layers.Activation, @@ -70,7 +70,6 @@ def test_activation(): input_shape=(3, 2)) -@keras_test def test_reshape(): layer_test(layers.Reshape, kwargs={'target_shape': (8, 1)}, @@ -89,14 +88,12 @@ def test_reshape(): input_shape=(None, None, 4)) -@keras_test def test_permute(): layer_test(layers.Permute, kwargs={'dims': (2, 1)}, input_shape=(3, 2, 4)) -@keras_test def test_flatten(): def test_4d(): @@ -157,14 +154,12 @@ def test_5d(): test_5d() -@keras_test def test_repeat_vector(): layer_test(layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2)) -@keras_test def test_lambda(): layer_test(layers.Lambda, kwargs={'function': lambda x: x + 1}, @@ -282,7 +277,6 @@ def f_shape(s): ld = deserialize_layer({'class_name': 'Lambda', 'config': config}) -@keras_test @pytest.mark.skipif((K.backend() == 'theano'), reason="theano cannot compute " "the output shape automatically.") @@ -292,7 +286,6 @@ def test_lambda_output_shape(): input_shape=(3, 2, 4)) -@keras_test def test_dense(): layer_test(layers.Dense, kwargs={'units': 3}, @@ -326,7 +319,6 @@ def test_dense(): assert len(layer.losses) == 2 -@keras_test @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend does not support native functional API yet.') def test_activity_regularization(): @@ -347,7 +339,6 @@ def test_activity_regularization(): model.compile('rmsprop', 'mse') -@keras_test def test_sequential_as_downstream_of_masking_layer(): inputs = layers.Input(shape=(3, 4)) @@ -365,7 +356,8 @@ def test_sequential_as_downstream_of_masking_layer(): 
np.random.random((10, 3, 5)), epochs=1, batch_size=6) mask_outputs = [model.layers[1].compute_mask(model.layers[1].input)] - mask_outputs += [model.layers[2].compute_mask(model.layers[2].input, mask_outputs[-1])] + mask_outputs += [model.layers[2].compute_mask(model.layers[2].input, + mask_outputs[-1])] func = K.function([model.input], mask_outputs) mask_outputs_val = func([model_input]) assert np.array_equal(mask_outputs_val[0], np.any(model_input, axis=-1)) diff --git a/tests/keras/layers/cudnn_recurrent_test.py b/tests/keras/layers/cudnn_recurrent_test.py index 701cf623eaf..85ae3f1d211 100644 --- a/tests/keras/layers/cudnn_recurrent_test.py +++ b/tests/keras/layers/cudnn_recurrent_test.py @@ -4,16 +4,15 @@ import keras import keras.backend as K from keras.utils.test_utils import layer_test -from keras.utils.test_utils import keras_test import time skipif_no_tf_gpu = pytest.mark.skipif( - (K.backend() != 'tensorflow') or (not K.tensorflow_backend._get_available_gpus()), + (K.backend() != 'tensorflow' or + not K.tensorflow_backend._get_available_gpus()), reason='Requires TensorFlow backend and a GPU') -@keras_test @skipif_no_tf_gpu def test_cudnn_rnn_canonical_to_params_lstm(): units = 1 @@ -71,7 +70,6 @@ def test_cudnn_rnn_canonical_to_params_lstm(): assert diff < 1e-8 -@keras_test @skipif_no_tf_gpu def test_cudnn_rnn_canonical_to_params_gru(): units = 7 @@ -121,7 +119,6 @@ def test_cudnn_rnn_canonical_to_params_gru(): assert diff < 1e-8 -@keras_test @pytest.mark.parametrize('rnn_type', ['lstm', 'gru'], ids=['LSTM', 'GRU']) @skipif_no_tf_gpu def test_cudnn_rnn_timing(rnn_type): @@ -160,7 +157,6 @@ def test_cudnn_rnn_timing(rnn_type): assert speedup > 3 -@keras_test @skipif_no_tf_gpu def test_cudnn_rnn_basics(): input_size = 10 @@ -188,7 +184,6 @@ def test_cudnn_rnn_basics(): input_shape=(num_samples, timesteps, input_size)) -@keras_test @skipif_no_tf_gpu def test_trainability(): input_size = 10 @@ -209,7 +204,6 @@ def test_trainability(): assert len(layer.non_trainable_weights) == 0 -@keras_test @skipif_no_tf_gpu def test_regularizer(): input_size = 10 @@ -236,7 +230,6 @@ def test_regularizer(): assert len(layer.get_losses_for(x)) == 1 -@keras_test @skipif_no_tf_gpu def test_return_state(): input_size = 10 @@ -260,7 +253,6 @@ def test_return_state(): keras.backend.eval(layer.states[0]), state, atol=1e-4) -@keras_test @skipif_no_tf_gpu def test_specify_initial_state_keras_tensor(): input_size = 10 @@ -289,7 +281,6 @@ def test_specify_initial_state_keras_tensor(): model.fit([inputs] + initial_state, targets) -@keras_test @skipif_no_tf_gpu def test_statefulness(): input_size = 10 @@ -336,7 +327,6 @@ def test_statefulness(): assert(out4.max() != out5.max()) -@keras_test @skipif_no_tf_gpu def test_cudnnrnn_bidirectional(): rnn = keras.layers.CuDNNGRU diff --git a/tests/keras/layers/embeddings_test.py b/tests/keras/layers/embeddings_test.py index c315ff128dc..a489f9ac963 100644 --- a/tests/keras/layers/embeddings_test.py +++ b/tests/keras/layers/embeddings_test.py @@ -1,10 +1,10 @@ import pytest -from keras.utils.test_utils import layer_test, keras_test +from keras.utils.test_utils import layer_test from keras.layers.embeddings import Embedding +from keras.models import Sequential import keras.backend as K -@keras_test def test_embedding(): layer_test(Embedding, kwargs={'output_dim': 4, 'input_dim': 10, 'input_length': 2}, @@ -22,7 +22,8 @@ def test_embedding(): input_dtype='int32', expected_output_dtype=K.floatx()) layer_test(Embedding, - kwargs={'output_dim': 4, 'input_dim': 10, 'mask_zero': 
True, 'input_length': (None, 5)}, + kwargs={'output_dim': 4, 'input_dim': 10, 'mask_zero': True, + 'input_length': (None, 5)}, input_shape=(3, 2, 5), input_dtype='int32', expected_output_dtype=K.floatx()) @@ -35,5 +36,24 @@ def test_embedding(): ) +def test_embedding_invalid(): + + # len(input_length) should be equal to len(input_shape) - 1 + with pytest.raises(ValueError): + model = Sequential([Embedding( + input_dim=10, + output_dim=4, + input_length=2, + input_shape=(3, 4, 5))]) + + # input_length should be equal to input_shape[1:] + with pytest.raises(ValueError): + model = Sequential([Embedding( + input_dim=10, + output_dim=4, + input_length=2, + input_shape=(3, 5))]) + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/layers/local_test.py b/tests/keras/layers/local_test.py index 80e6a6fc036..46921790ec4 100644 --- a/tests/keras/layers/local_test.py +++ b/tests/keras/layers/local_test.py @@ -1,7 +1,6 @@ import pytest from keras.utils.test_utils import layer_test -from keras.utils.test_utils import keras_test from keras.layers import local from keras import backend as K @@ -10,7 +9,6 @@ reason='MXNet backend does not support local_conv1d/2d yet.') -@keras_test def test_locallyconnected_1d(): num_samples = 2 num_steps = 8 @@ -31,7 +29,6 @@ def test_locallyconnected_1d(): input_shape=(num_samples, num_steps, input_dim)) -@keras_test def test_locallyconnected_2d(): num_samples = 5 filters = 3 diff --git a/tests/keras/layers/merge_test.py b/tests/keras/layers/merge_test.py index fdba7e1b3fe..96c4d52cf39 100644 --- a/tests/keras/layers/merge_test.py +++ b/tests/keras/layers/merge_test.py @@ -4,10 +4,10 @@ from keras import layers from keras import models from keras import backend as K -from keras.utils.test_utils import keras_test +from keras.utils.test_utils import layer_test +from keras.layers import merge -@keras_test def test_merge_add(): i1 = layers.Input(shape=(4, 5)) i2 = layers.Input(shape=(4, 5)) @@ -40,7 +40,6 @@ def test_merge_add(): add_layer.compute_mask([i1, i2, i3], [None, None]) -@keras_test def test_merge_subtract(): i1 = layers.Input(shape=(4, 5)) i2 = layers.Input(shape=(4, 5)) @@ -75,7 +74,6 @@ def test_merge_subtract(): subtract_layer([i1]) -@keras_test def test_merge_multiply(): i1 = layers.Input(shape=(4, 5)) i2 = layers.Input(shape=(4, 5)) @@ -96,7 +94,6 @@ def test_merge_multiply(): assert_allclose(out, x1 * x2 * x3, atol=1e-4) -@keras_test def test_merge_average(): i1 = layers.Input(shape=(4, 5)) i2 = layers.Input(shape=(4, 5)) @@ -115,7 +112,6 @@ def test_merge_average(): assert_allclose(out, 0.5 * (x1 + x2), atol=1e-4) -@keras_test def test_merge_maximum(): i1 = layers.Input(shape=(4, 5)) i2 = layers.Input(shape=(4, 5)) @@ -134,7 +130,6 @@ def test_merge_maximum(): assert_allclose(out, np.maximum(x1, x2), atol=1e-4) -@keras_test def test_merge_minimum(): i1 = layers.Input(shape=(4, 5)) i2 = layers.Input(shape=(4, 5)) @@ -153,7 +148,6 @@ def test_merge_minimum(): assert_allclose(out, np.minimum(x1, x2), atol=1e-4) -@keras_test def test_merge_concatenate(): i1 = layers.Input(shape=(None, 5)) i2 = layers.Input(shape=(None, 5)) @@ -206,7 +200,6 @@ def test_merge_concatenate(): concat_layer([i1]) -@keras_test def test_merge_dot(): i1 = layers.Input(shape=(4,)) i2 = layers.Input(shape=(4,)) @@ -238,7 +231,6 @@ def test_merge_dot(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not allow broadcast for unknown shape in Layers yet') -@keras_test def test_merge_broadcast(): # shapes provided i1 = layers.Input(shape=(4, 
5)) @@ -288,5 +280,15 @@ def test_merge_broadcast(): K.ndim = k_ndim +def test_masking_concatenate(): + input1 = layers.Input(shape=(6,)) + input2 = layers.Input(shape=(6,)) + x1 = layers.Embedding(10, 5, input_length=6, mask_zero=True)(input1) + x2 = layers.Embedding(10, 5, input_length=6, mask_zero=True)(input2) + x = layers.concatenate([x1, x2]) + x = layers.wrappers.TimeDistributed(layers.Dense(3, activation='softmax'))(x) + models.Model(inputs=[input1, input2], outputs=[x]) + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/layers/noise_test.py b/tests/keras/layers/noise_test.py index 2753f02984f..a01a123b140 100644 --- a/tests/keras/layers/noise_test.py +++ b/tests/keras/layers/noise_test.py @@ -1,11 +1,9 @@ import pytest from keras.utils.test_utils import layer_test -from keras.utils.test_utils import keras_test from keras.layers import noise from keras import backend as K -@keras_test @pytest.mark.skipif((K.backend() == 'cntk' or K.backend() == 'mxnet'), reason='cntk/mxnet does not support it yet') def test_GaussianNoise(): @@ -14,7 +12,6 @@ def test_GaussianNoise(): input_shape=(3, 2, 3)) -@keras_test @pytest.mark.skipif((K.backend() == 'cntk' or K.backend() == 'mxnet'), reason='cntk/mxnet does not support it yet') def test_GaussianDropout(): @@ -23,7 +20,6 @@ def test_GaussianDropout(): input_shape=(3, 2, 3)) -@keras_test @pytest.mark.skipif((K.backend() == 'cntk' or K.backend() == 'mxnet'), reason='cntk/mxnet does not support it yet') def test_AlphaDropout(): diff --git a/tests/keras/layers/normalization_test.py b/tests/keras/layers/normalization_test.py index e8e68b3a9e1..860d214ec0d 100644 --- a/tests/keras/layers/normalization_test.py +++ b/tests/keras/layers/normalization_test.py @@ -4,7 +4,7 @@ from keras.layers import Input from keras import regularizers -from keras.utils.test_utils import layer_test, keras_test +from keras.utils.test_utils import layer_test from keras.layers import normalization from keras.models import Sequential, Model from keras import backend as K @@ -16,7 +16,6 @@ input_shapes = [np.ones((10, 10)), np.ones((10, 10, 10))] -@keras_test def test_basic_batchnorm(): layer_test(normalization.BatchNormalization, kwargs={'momentum': 0.9, @@ -28,7 +27,7 @@ def test_basic_batchnorm(): kwargs={'momentum': 0.9, 'epsilon': 0.1, 'axis': 1}, - input_shape=(3, 4, 2)) + input_shape=(1, 4, 1)) layer_test(normalization.BatchNormalization, kwargs={'gamma_initializer': 'ones', 'beta_initializer': 'ones', @@ -45,8 +44,8 @@ def test_basic_batchnorm(): input_shape=(3, 4, 2, 4)) -@keras_test def test_batchnorm_correctness_1d(): + np.random.seed(1337) model = Sequential() norm = normalization.BatchNormalization(input_shape=(10,), momentum=0.8) model.add(norm) @@ -63,10 +62,11 @@ def test_batchnorm_correctness_1d(): assert_allclose(out.std(), 1.0, atol=1e-1) -@keras_test def test_batchnorm_correctness_2d(): + np.random.seed(1337) model = Sequential() - norm = normalization.BatchNormalization(axis=1, input_shape=(10, 6), momentum=0.8) + norm = normalization.BatchNormalization(axis=1, input_shape=(10, 6), + momentum=0.8) model.add(norm) model.compile(loss='mse', optimizer='rmsprop') @@ -83,15 +83,14 @@ def test_batchnorm_correctness_2d(): @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend uses native BatchNorm operator. 
Do not do updates in the model.') -@keras_test def test_batchnorm_training_argument(): + np.random.seed(1337) bn1 = normalization.BatchNormalization(input_shape=(10,)) x1 = Input(shape=(10,)) y1 = bn1(x1, training=True) assert bn1.updates model1 = Model(x1, y1) - np.random.seed(123) x = np.random.normal(loc=5.0, scale=10.0, size=(20, 10)) output_a = model1.predict(x) @@ -108,7 +107,6 @@ def test_batchnorm_training_argument(): assert not bn2.updates -@keras_test def test_batchnorm_mode_twice(): # This is a regression test for issue #4881 with the old # batch normalization functions in the Theano backend. @@ -122,10 +120,11 @@ def test_batchnorm_mode_twice(): model.predict(x) -@keras_test def test_batchnorm_convnet(): + np.random.seed(1337) model = Sequential() - norm = normalization.BatchNormalization(axis=1, input_shape=(3, 4, 4), momentum=0.8) + norm = normalization.BatchNormalization(axis=1, input_shape=(3, 4, 4), + momentum=0.8) model.add(norm) model.compile(loss='mse', optimizer='sgd') @@ -140,10 +139,10 @@ def test_batchnorm_convnet(): assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1) -@keras_test @pytest.mark.skipif((K.backend() == 'theano'), reason='Bug with theano backend') def test_batchnorm_convnet_no_center_no_scale(): + np.random.seed(1337) model = Sequential() norm = normalization.BatchNormalization(axis=-1, center=False, scale=False, input_shape=(3, 4, 4), momentum=0.8) @@ -161,7 +160,6 @@ def test_batchnorm_convnet_no_center_no_scale(): @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend uses native BatchNorm operator. Do not do updates in the model.') -@keras_test def test_shared_batchnorm(): '''Test that a BN layer can be shared across different data streams. @@ -191,7 +189,6 @@ def test_shared_batchnorm(): @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend uses native BatchNorm operator which does updates in the model.') -@keras_test def test_that_trainable_disables_updates(): val_a = np.random.random((10, 4)) val_out = np.random.random((10, 4)) @@ -233,7 +230,6 @@ def test_that_trainable_disables_updates(): @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend uses native BatchNorm operator which does not support' 'set weights outside the model.') -@keras_test def test_batchnorm_trainable(): bn_mean = 0.5 bn_std = 10. 
@@ -252,5 +248,6 @@ def get_model(bn_mean, bn_std): out = model.predict(input_4) assert_allclose((input_4 - np.mean(input_4)) / np.std(input_4), out, atol=1e-3) + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/layers/pooling_test.py b/tests/keras/layers/pooling_test.py new file mode 100644 index 00000000000..2c3e8616e1e --- /dev/null +++ b/tests/keras/layers/pooling_test.py @@ -0,0 +1,155 @@ +import numpy as np +import pytest + +from keras.utils.test_utils import layer_test +from keras.layers import pooling +from keras.layers import Masking +from keras.layers import convolutional +from keras.models import Sequential + + +@pytest.mark.parametrize( + 'padding,stride,data_format', + [(padding, stride, data_format) + for padding in ['valid', 'same'] + for stride in [1, 2] + for data_format in ['channels_first', 'channels_last'] + if not (padding == 'same' and stride == 1)] +) +def test_maxpooling_1d(padding, stride, data_format): + layer_test(convolutional.MaxPooling1D, + kwargs={'strides': stride, + 'padding': padding, + 'data_format': data_format}, + input_shape=(3, 5, 4)) + + +@pytest.mark.parametrize( + 'strides', + [(1, 1), (2, 3)] +) +def test_maxpooling_2d(strides): + pool_size = (3, 3) + layer_test(convolutional.MaxPooling2D, + kwargs={'strides': strides, + 'padding': 'valid', + 'pool_size': pool_size}, + input_shape=(3, 5, 6, 4)) + + +@pytest.mark.parametrize( + 'strides,data_format,input_shape', + [(2, None, (3, 11, 12, 10, 4)), + (3, 'channels_first', (3, 4, 11, 12, 10))] +) +def test_maxpooling_3d(strides, data_format, input_shape): + pool_size = (3, 3, 3) + layer_test(convolutional.MaxPooling3D, + kwargs={'strides': strides, + 'padding': 'valid', + 'data_format': data_format, + 'pool_size': pool_size}, + input_shape=input_shape) + + +@pytest.mark.parametrize( + 'padding,stride,data_format', + [(padding, stride, data_format) + for padding in ['valid', 'same'] + for stride in [1, 2] + for data_format in ['channels_first', 'channels_last'] + if not(padding == 'same' and stride == 1)] +) +def test_averagepooling_1d(padding, stride, data_format): + layer_test(convolutional.AveragePooling1D, + kwargs={'strides': stride, + 'padding': padding, + 'data_format': data_format}, + input_shape=(3, 5, 4)) + + +@pytest.mark.parametrize( + 'strides,padding,data_format,input_shape', + [((2, 2), 'same', None, (3, 5, 6, 4)), + ((2, 2), 'valid', None, (3, 5, 6, 4)), + ((1, 1), 'valid', 'channels_first', (3, 4, 5, 6))] +) +def test_averagepooling_2d(strides, padding, data_format, input_shape): + layer_test(convolutional.AveragePooling2D, + kwargs={'strides': strides, + 'padding': padding, + 'pool_size': (2, 2), + 'data_format': data_format}, + input_shape=input_shape) + + +@pytest.mark.parametrize( + 'strides,data_format,input_shape', + [(2, None, (3, 11, 12, 10, 4)), + (3, 'channels_first', (3, 4, 11, 12, 10))] +) +def test_averagepooling_3d(strides, data_format, input_shape): + pool_size = (3, 3, 3) + + layer_test(convolutional.AveragePooling3D, + kwargs={'strides': strides, + 'padding': 'valid', + 'data_format': data_format, + 'pool_size': pool_size}, + input_shape=input_shape) + + +@pytest.mark.parametrize( + 'data_format,pooling_class', + [(data_format, pooling_class) + for data_format in ['channels_first', 'channels_last'] + for pooling_class in [pooling.GlobalMaxPooling1D, + pooling.GlobalAveragePooling1D]] +) +def test_globalpooling_1d(data_format, pooling_class): + layer_test(pooling_class, + kwargs={'data_format': data_format}, + input_shape=(3, 4, 5)) + + +def 
test_globalpooling_1d_supports_masking(): + # Test GlobalAveragePooling1D supports masking + model = Sequential() + model.add(Masking(mask_value=0., input_shape=(3, 4))) + model.add(pooling.GlobalAveragePooling1D()) + model.compile(loss='mae', optimizer='adam') + + model_input = np.random.randint(low=1, high=5, size=(2, 3, 4)) + model_input[0, 1:, :] = 0 + output = model.predict(model_input) + assert np.array_equal(output[0], model_input[0, 0, :]) + + +@pytest.mark.parametrize( + 'data_format,pooling_class', + [(data_format, pooling_class) + for data_format in ['channels_first', 'channels_last'] + for pooling_class in [pooling.GlobalMaxPooling2D, + pooling.GlobalAveragePooling2D]] +) +def test_globalpooling_2d(data_format, pooling_class): + layer_test(pooling_class, + kwargs={'data_format': data_format}, + input_shape=(3, 4, 5, 6)) + + +@pytest.mark.parametrize( + 'data_format,pooling_class', + [(data_format, pooling_class) + for data_format in ['channels_first', 'channels_last'] + for pooling_class in [pooling.GlobalMaxPooling3D, + pooling.GlobalAveragePooling3D]] +) +def test_globalpooling_3d(data_format, pooling_class): + layer_test(pooling_class, + kwargs={'data_format': data_format}, + input_shape=(3, 4, 3, 4, 3)) + + +if __name__ == '__main__': + pytest.main([__file__]) diff --git a/tests/keras/layers/recurrent_test.py b/tests/keras/layers/recurrent_test.py index 232c6d97374..45e41b0bc85 100644 --- a/tests/keras/layers/recurrent_test.py +++ b/tests/keras/layers/recurrent_test.py @@ -4,7 +4,6 @@ import keras from keras.utils.test_utils import layer_test -from keras.utils.test_utils import keras_test from keras.layers import recurrent from keras.layers import embeddings from keras.models import Sequential @@ -18,28 +17,16 @@ embedding_num = 12 -@keras_test -def rnn_test(f): - """ - All the recurrent layers share the same interface, - so we can run through them with a single function. - """ - f = keras_test(f) - return pytest.mark.parametrize('layer_class', [ - recurrent.SimpleRNN, - recurrent.GRU, - recurrent.LSTM - ])(f) +rnn_test = pytest.mark.parametrize('layer_class', + [recurrent.SimpleRNN, + recurrent.GRU, + recurrent.LSTM]) -@keras_test -def rnn_cell_test(f): - f = keras_test(f) - return pytest.mark.parametrize('cell_class', [ - recurrent.SimpleRNNCell, - recurrent.GRUCell, - recurrent.LSTMCell - ])(f) +rnn_cell_test = pytest.mark.parametrize('cell_class', + [recurrent.SimpleRNNCell, + recurrent.GRUCell, + recurrent.LSTMCell]) @rnn_test @@ -245,7 +232,6 @@ def test_trainability(layer_class): assert len(layer.non_trainable_weights) == 0 -@keras_test def test_masking_layer(): ''' This test based on a previously failing issue here: https://github.com/keras-team/keras/issues/1567 @@ -436,7 +422,6 @@ def test_state_reuse_with_dropout(layer_class): outputs = model.predict(inputs) -@keras_test def test_minimal_rnn_cell_non_layer(): class MinimalRNNCell(object): @@ -461,18 +446,19 @@ def call(self, inputs, states): model.compile(optimizer='rmsprop', loss='mse') model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - # Test stacking. - cells = [MinimalRNNCell(8, 5), - MinimalRNNCell(32, 8), - MinimalRNNCell(32, 32)] - layer = recurrent.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + # MXNet does not support cell stacking yet + if K.backend() != 'mxnet': + # Test stacking. 
+ cells = [MinimalRNNCell(8, 5), + MinimalRNNCell(32, 8), + MinimalRNNCell(32, 32)] + layer = recurrent.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) -@keras_test def test_minimal_rnn_cell_non_layer_multiple_states(): class MinimalRNNCell(object): @@ -500,19 +486,20 @@ def call(self, inputs, states): model.compile(optimizer='rmsprop', loss='mse') model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - # Test stacking. - cells = [MinimalRNNCell(8, 5), - MinimalRNNCell(16, 8), - MinimalRNNCell(32, 16)] - layer = recurrent.RNN(cells) - assert layer.cell.state_size == (32, 32, 16, 16, 8, 8) - y = layer(x) - model = keras.models.Model(x, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + # MXNet does not support cell stacking yet + if K.backend() != 'mxnet': + # Test stacking. + cells = [MinimalRNNCell(8, 5), + MinimalRNNCell(16, 8), + MinimalRNNCell(32, 16)] + layer = recurrent.RNN(cells) + assert layer.cell.state_size == (8, 8, 16, 16, 32, 32) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) -@keras_test def test_minimal_rnn_cell_layer(): class MinimalRNNCell(keras.layers.Layer): @@ -568,32 +555,32 @@ def get_config(self): y_np_2 = model.predict(x_np) assert_allclose(y_np, y_np_2, atol=1e-4) - # Test stacking. - cells = [MinimalRNNCell(8), - MinimalRNNCell(12), - MinimalRNNCell(32)] - layer = recurrent.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test stacked RNN serialization. - x_np = np.random.random((6, 5, 5)) - y_np = model.predict(x_np) - weights = model.get_weights() - config = layer.get_config() - with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): - layer = recurrent.RNN.from_config(config) - y = layer(x) - model = keras.models.Model(x, y) - model.set_weights(weights) - y_np_2 = model.predict(x_np) - assert_allclose(y_np, y_np_2, atol=1e-4) + # MXNet does not support cell stacking yet + if K.backend() != 'mxnet': + # Test stacking. + cells = [MinimalRNNCell(8), + MinimalRNNCell(12), + MinimalRNNCell(32)] + layer = recurrent.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacked RNN serialization. + x_np = np.random.random((6, 5, 5)) + y_np = model.predict(x_np) + weights = model.get_weights() + config = layer.get_config() + with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): + layer = recurrent.RNN.from_config(config) + y = layer(x) + model = keras.models.Model(x, y) + model.set_weights(weights) + y_np_2 = model.predict(x_np) + assert_allclose(y_np, y_np_2, atol=1e-4) -@pytest.mark.skipif(K.backend() == 'mxnet', - reason='MXNet backend does not support custom RNN layers yet') @rnn_cell_test def test_builtin_rnn_cell_layer(cell_class): # Test basic case. @@ -617,34 +604,35 @@ def test_builtin_rnn_cell_layer(cell_class): y_np_2 = model.predict(x_np) assert_allclose(y_np, y_np_2, atol=1e-4) - # Test stacking. 
- cells = [cell_class(8), - cell_class(12), - cell_class(32)] - layer = recurrent.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test stacked RNN serialization. - x_np = np.random.random((6, 5, 5)) - y_np = model.predict(x_np) - weights = model.get_weights() - config = layer.get_config() - layer = recurrent.RNN.from_config(config) - y = layer(x) - model = keras.models.Model(x, y) - model.set_weights(weights) - y_np_2 = model.predict(x_np) - assert_allclose(y_np, y_np_2, atol=1e-4) + # MXNet does not support cell stacking yet + if K.backend() != 'mxnet': + # Test stacking. + cells = [cell_class(8), + cell_class(12), + cell_class(32)] + layer = recurrent.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacked RNN serialization. + x_np = np.random.random((6, 5, 5)) + y_np = model.predict(x_np) + weights = model.get_weights() + config = layer.get_config() + layer = recurrent.RNN.from_config(config) + y = layer(x) + model = keras.models.Model(x, y) + model.set_weights(weights) + y_np_2 = model.predict(x_np) + assert_allclose(y_np, y_np_2, atol=1e-4) -@keras_test @pytest.mark.skipif((K.backend() in ['cntk', 'theano']), reason='Not supported.') @pytest.mark.skipif(K.backend() == 'mxnet', - reason='MXNet backend does not support custom RNN layers yet') + reason='MXNet backend does not support stacking RNN cells yet') def test_stacked_rnn_dropout(): cells = [recurrent.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1), recurrent.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1)] @@ -659,7 +647,6 @@ def test_stacked_rnn_dropout(): model.train_on_batch(x_np, y_np) -@keras_test def test_stacked_rnn_attributes(): cells = [recurrent.LSTMCell(3), recurrent.LSTMCell(3, kernel_regularizer='l2')] @@ -682,12 +669,24 @@ def test_stacked_rnn_attributes(): assert layer.get_losses_for(x) == [y] -@keras_test def test_stacked_rnn_compute_output_shape(): cells = [recurrent.LSTMCell(3), recurrent.LSTMCell(6)] layer = recurrent.RNN(cells, return_state=True, return_sequences=True) output_shape = layer.compute_output_shape((None, timesteps, embedding_dim)) + expected_output_shape = [(None, timesteps, 6), + (None, 3), + (None, 3), + (None, 6), + (None, 6)] + assert output_shape == expected_output_shape + + # Test reverse_state_order = True for stacked cell. 
+ stacked_cell = recurrent.StackedRNNCells( + cells, reverse_state_order=True) + layer = recurrent.RNN( + stacked_cell, return_state=True, return_sequences=True) + output_shape = layer.compute_output_shape((None, timesteps, embedding_dim)) expected_output_shape = [(None, timesteps, 6), (None, 6), (None, 6), @@ -710,7 +709,6 @@ def test_batch_size_equal_one(layer_class): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support custom RNN layers yet') -@keras_test def test_rnn_cell_with_constants_layer(): class RNNCellWithConstants(keras.layers.Layer): @@ -821,7 +819,6 @@ def get_config(self): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support custom RNN layers yet') -@keras_test def test_rnn_cell_with_constants_layer_passing_initial_state(): class RNNCellWithConstants(keras.layers.Layer): @@ -910,5 +907,58 @@ def get_config(self): assert_allclose(y_np, y_np_3, atol=1e-4) +@rnn_test +def test_rnn_cell_identity_initializer(layer_class): + inputs = Input(shape=(timesteps, embedding_dim)) + layer = layer_class(units, recurrent_initializer='identity') + layer(inputs) + recurrent_kernel = layer.get_weights()[1] + num_kernels = recurrent_kernel.shape[1] // recurrent_kernel.shape[0] + assert np.array_equal(recurrent_kernel, + np.concatenate([np.identity(units)] * num_kernels, axis=1)) + + +@pytest.mark.skipif(K.backend() == 'cntk' or K.backend() == 'mxnet', reason='Not supported.') +def test_inconsistent_output_state_size(): + + class PlusOneRNNCell(keras.layers.Layer): + """Add one to the input and state. + + This cell is used for testing state_size and output_size.""" + + def __init__(self, num_unit, **kwargs): + self.state_size = num_unit + super(PlusOneRNNCell, self).__init__(**kwargs) + + def build(self, input_shape): + self.output_size = input_shape[-1] + + def call(self, inputs, states): + return inputs + 1, [states[0] + 1] + + batch = 32 + time_step = 4 + state_size = 5 + input_size = 6 + cell = PlusOneRNNCell(state_size) + x = keras.Input((None, input_size)) + layer = recurrent.RNN(cell) + y = layer(x) + + assert cell.state_size == state_size + init_state = layer.get_initial_state(x) + assert len(init_state) == 1 + if K.backend() != 'theano': + # theano does not support static shape inference. 
+ assert K.int_shape(init_state[0]) == (None, state_size) + + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch( + np.zeros((batch, time_step, input_size)), + np.zeros((batch, input_size))) + assert model.output_shape == (None, input_size) + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/layers/wrappers_test.py b/tests/keras/layers/wrappers_test.py index 3bfc8241a4b..d9c4481e6f2 100644 --- a/tests/keras/layers/wrappers_test.py +++ b/tests/keras/layers/wrappers_test.py @@ -2,7 +2,6 @@ import numpy as np import copy from numpy.testing import assert_allclose -from keras.utils.test_utils import keras_test from keras.utils import CustomObjectScope from keras.layers import wrappers, Input, Layer from keras.layers import RNN @@ -12,11 +11,8 @@ from keras.utils.generic_utils import object_list_uid, to_list -pytestmark = pytest.mark.skipif(K.backend() == 'mxnet', - reason='MXNet backend does not support TimeDistributed and RNN yet') - - -@keras_test +@pytest.mark.skipif(K.backend() == 'mxnet', + reason='MXNet backend does not support TimeDistributed and RNN yet') def test_TimeDistributed(): # first, test with Dense layer model = Sequential() @@ -131,7 +127,8 @@ def test_TimeDistributed(): assert K.int_shape(td._input_map[uid]) == (None, 2) -@keras_test +@pytest.mark.skipif(K.backend() == 'mxnet', + reason='MXNet backend does not support TimeDistributed and RNN yet') @pytest.mark.skipif((K.backend() == 'cntk'), reason='Flaky with CNTK backend') def test_TimeDistributed_learning_phase(): @@ -144,7 +141,8 @@ def test_TimeDistributed_learning_phase(): assert_allclose(np.mean(y), 0., atol=1e-1, rtol=1e-1) -@keras_test +@pytest.mark.skipif(K.backend() == 'mxnet', + reason='MXNet backend does not support TimeDistributed and RNN yet') def test_TimeDistributed_trainable(): # test layers that need learning_phase to be set x = Input(shape=(3, 2)) @@ -160,14 +158,14 @@ def test_TimeDistributed_trainable(): assert len(layer.trainable_weights) == 2 -@keras_test -@pytest.mark.skipif((K.backend() == 'cntk'), - reason='Unknown timestamps for RNN not supported in CNTK.') +@pytest.mark.skipif((K.backend() == 'cntk' or K.backend() == 'mxnet'), + reason='Unknown timestamps for RNN not supported in CNTK and MXNet.') def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(): # test with unspecified shape and Embeddings with mask_zero model = Sequential() model.add(wrappers.TimeDistributed(layers.Embedding(5, 6, mask_zero=True), - input_shape=(None, None))) # N by t_1 by t_2 by 6 + input_shape=(None, None))) + # the shape so far: (N, t_1, t_2, 6) model.add(wrappers.TimeDistributed(layers.SimpleRNN(7, return_sequences=True))) model.add(wrappers.TimeDistributed(layers.SimpleRNN(8, return_sequences=False))) model.add(layers.SimpleRNN(1, return_sequences=False)) @@ -191,7 +189,8 @@ def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(): assert mask_outputs[-1] is None # final layer -@keras_test +@pytest.mark.skipif(K.backend() == 'mxnet', + reason='MXNet backend does not support TimeDistributed and RNN yet') def test_TimeDistributed_with_masking_layer(): # test with Masking layer model = Sequential() @@ -206,14 +205,14 @@ def test_TimeDistributed_with_masking_layer(): model.fit(model_input, np.random.random((10, 3, 5)), epochs=1, batch_size=6) mask_outputs = [model.layers[0].compute_mask(model.input)] - mask_outputs += [model.layers[1].compute_mask(model.layers[1].input, mask_outputs[-1])] + mask_outputs += 
[model.layers[1].compute_mask(model.layers[1].input, + mask_outputs[-1])] func = K.function([model.input], mask_outputs) mask_outputs_val = func([model_input]) assert np.array_equal(mask_outputs_val[0], np.any(model_input, axis=-1)) assert np.array_equal(mask_outputs_val[1], np.any(model_input, axis=-1)) -@keras_test def test_regularizers(): model = Sequential() model.add(wrappers.TimeDistributed( @@ -233,7 +232,6 @@ def test_regularizers(): assert len(model.losses) == 1 -@keras_test def test_Bidirectional(): rnn = layers.SimpleRNN samples = 2 @@ -288,7 +286,6 @@ def test_Bidirectional(): model.fit(x, y, epochs=1, batch_size=1) -@keras_test @pytest.mark.skipif((K.backend() == 'cntk'), reason='Unknown timestamps not supported in CNTK.') def test_Bidirectional_dynamic_timesteps(): @@ -313,7 +310,6 @@ def test_Bidirectional_dynamic_timesteps(): model.fit(x, y, epochs=1, batch_size=1) -@keras_test @pytest.mark.parametrize('merge_mode', ['sum', 'mul', 'ave', 'concat', None]) def test_Bidirectional_merged_value(merge_mode): rnn = layers.LSTM @@ -336,10 +332,12 @@ def test_Bidirectional_merged_value(merge_mode): # basic case inputs = Input((timesteps, dim)) - layer = wrappers.Bidirectional(rnn(units, return_sequences=True), merge_mode=merge_mode) + layer = wrappers.Bidirectional(rnn(units, return_sequences=True), + merge_mode=merge_mode) f_merged = K.function([inputs], to_list(layer(inputs))) f_forward = K.function([inputs], [layer.forward_layer.call(inputs)]) - f_backward = K.function([inputs], [K.reverse(layer.backward_layer.call(inputs), 1)]) + f_backward = K.function([inputs], + [K.reverse(layer.backward_layer.call(inputs), 1)]) y_merged = f_merged(X) y_expected = to_list(merge_func(f_forward(X)[0], f_backward(X)[0])) @@ -349,7 +347,8 @@ def test_Bidirectional_merged_value(merge_mode): # test return_state inputs = Input((timesteps, dim)) - layer = wrappers.Bidirectional(rnn(units, return_state=True), merge_mode=merge_mode) + layer = wrappers.Bidirectional(rnn(units, return_state=True), + merge_mode=merge_mode) f_merged = K.function([inputs], layer(inputs)) f_forward = K.function([inputs], layer.forward_layer.call(inputs)) f_backward = K.function([inputs], layer.backward_layer.call(inputs)) @@ -371,8 +370,7 @@ def test_Bidirectional_merged_value(merge_mode): assert_allclose(state_birnn, state_inner, atol=1e-5) -@keras_test -@pytest.mark.skipif(K.backend() == 'theano', reason='Not supported.') +@pytest.mark.skipif(K.backend() == 'theano' or K.backend() == 'mxnet', reason='Not supported.') @pytest.mark.parametrize('merge_mode', ['sum', 'concat', None]) def test_Bidirectional_dropout(merge_mode): rnn = layers.LSTM @@ -402,7 +400,6 @@ def test_Bidirectional_dropout(merge_mode): assert_allclose(x1, x2, atol=1e-5) -@keras_test def test_Bidirectional_state_reuse(): rnn = layers.LSTM samples = 2 @@ -411,7 +408,8 @@ def test_Bidirectional_state_reuse(): units = 3 input1 = Input((timesteps, dim)) - layer = wrappers.Bidirectional(rnn(units, return_state=True, return_sequences=True)) + layer = wrappers.Bidirectional(rnn(units, return_state=True, + return_sequences=True)) state = layer(input1)[1:] # test passing invalid initial_state: passing a tensor @@ -429,7 +427,8 @@ def test_Bidirectional_state_reuse(): outputs = model.predict(inputs) -@keras_test +@pytest.mark.skipif(K.backend() == 'mxnet', + reason='MXNet backend does not support custom RNN cell yet') def test_Bidirectional_with_constants(): class RNNCellWithConstants(Layer): def __init__(self, units, **kwargs): @@ -510,7 +509,8 @@ def 
get_config(self): assert_allclose(y_np, y_np_3, atol=1e-4) -@keras_test +@pytest.mark.skipif(K.backend() == 'mxnet', + reason='MXNet backend does not support custom RNN cell yet') def test_Bidirectional_with_constants_layer_passing_initial_state(): class RNNCellWithConstants(Layer): def __init__(self, units, **kwargs): @@ -565,7 +565,8 @@ def get_config(self): model = Model([x, s_for, s_bac, c], y) model.compile(optimizer='rmsprop', loss='mse') model.train_on_batch( - [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 32)), np.zeros((6, 3))], + [np.zeros((6, 5, 5)), np.zeros((6, 32)), + np.zeros((6, 32)), np.zeros((6, 3))], np.zeros((6, 64)) ) @@ -600,7 +601,6 @@ def get_config(self): assert_allclose(y_np, y_np_3, atol=1e-4) -@keras_test def test_Bidirectional_trainable(): # test layers that need learning_phase to be set x = Input(shape=(3, 2)) @@ -613,7 +613,6 @@ def test_Bidirectional_trainable(): assert len(layer.trainable_weights) == 6 -@keras_test def test_Bidirectional_updates(): x = Input(shape=(3, 2)) layer = wrappers.Bidirectional(layers.SimpleRNN(3)) @@ -629,7 +628,6 @@ def test_Bidirectional_updates(): assert len(layer.get_updates_for(x)) == 2 -@keras_test def test_Bidirectional_losses(): x = Input(shape=(3, 2)) layer = wrappers.Bidirectional( diff --git a/tests/keras/legacy/interface_test.py b/tests/keras/legacy/interface_test.py index 190946543a1..7f8d7d32a81 100644 --- a/tests/keras/legacy/interface_test.py +++ b/tests/keras/legacy/interface_test.py @@ -1,12 +1,10 @@ import pytest import json -from keras.utils.test_utils import keras_test import keras import numpy as np from keras import backend as K -@keras_test def test_dense_legacy_interface(): old_layer = keras.layers.Dense(input_dim=3, output_dim=2, name='d') new_layer = keras.layers.Dense(2, input_shape=(3,), name='d') @@ -30,16 +28,14 @@ def test_dense_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_dropout_legacy_interface(): old_layer = keras.layers.Dropout(p=3, name='drop') - new_layer_1 = keras.layers.Dropout(rate=3, name='drop') - new_layer_2 = keras.layers.Dropout(3, name='drop') - assert json.dumps(old_layer.get_config()) == json.dumps(new_layer_1.get_config()) - assert json.dumps(old_layer.get_config()) == json.dumps(new_layer_2.get_config()) + new_layer1 = keras.layers.Dropout(rate=3, name='drop') + new_layer2 = keras.layers.Dropout(3, name='drop') + assert json.dumps(old_layer.get_config()) == json.dumps(new_layer1.get_config()) + assert json.dumps(old_layer.get_config()) == json.dumps(new_layer2.get_config()) -@keras_test def test_embedding_legacy_interface(): old_layer = keras.layers.Embedding(4, 2, name='d') new_layer = keras.layers.Embedding(output_dim=2, input_dim=4, name='d') @@ -60,7 +56,6 @@ def test_embedding_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_maxpooling1d_legacy_interface(): old_layer = keras.layers.MaxPool1D(pool_length=2, border_mode='valid', @@ -77,7 +72,6 @@ def test_maxpooling1d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_avgpooling1d_legacy_interface(): old_layer = keras.layers.AvgPool1D(pool_length=2, border_mode='valid', @@ -90,27 +84,25 @@ def test_avgpooling1d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_prelu_legacy_interface(): old_layer = keras.layers.PReLU(init='zero', 
name='p') new_layer = keras.layers.PReLU('zero', name='p') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_gaussiannoise_legacy_interface(): old_layer = keras.layers.GaussianNoise(sigma=0.5, name='gn') new_layer = keras.layers.GaussianNoise(stddev=0.5, name='gn') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_lstm_legacy_interface(): old_layer = keras.layers.LSTM(input_shape=[3, 5], output_dim=2, name='d') new_layer = keras.layers.LSTM(2, input_shape=[3, 5], name='d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.LSTM(input_shape=[3, 5], output_dim=2, name='d', consume_less='mem') + old_layer = keras.layers.LSTM(input_shape=[3, 5], output_dim=2, name='d', + consume_less='mem') new_layer = keras.layers.LSTM(2, input_shape=[3, 5], name='d', implementation=1) assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) @@ -121,10 +113,12 @@ def test_lstm_legacy_interface(): old_layer = keras.layers.LSTM(input_dim=5, output_dim=2, name='d', consume_less='mem') - new_layer = keras.layers.LSTM(2, input_shape=[None, 5], name='d', implementation=1) + new_layer = keras.layers.LSTM(2, input_shape=[None, 5], name='d', + implementation=1) assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.LSTM(input_shape=[3, 5], output_dim=2, name='d', consume_less='gpu') + old_layer = keras.layers.LSTM(input_shape=[3, 5], output_dim=2, name='d', + consume_less='gpu') new_layer = keras.layers.LSTM(2, input_shape=[3, 5], name='d', implementation=2) assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) @@ -177,7 +171,6 @@ def test_lstm_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_simplernn_legacy_interface(): old_layer = keras.layers.SimpleRNN(input_shape=[3, 5], output_dim=2, name='d') new_layer = keras.layers.SimpleRNN(2, input_shape=[3, 5], name='d') @@ -202,7 +195,6 @@ def test_simplernn_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_gru_legacy_interface(): old_layer = keras.layers.GRU(input_shape=[3, 5], output_dim=2, name='d') new_layer = keras.layers.GRU(2, input_shape=[3, 5], name='d') @@ -229,168 +221,225 @@ def test_gru_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_gaussiandropout_legacy_interface(): old_layer = keras.layers.GaussianDropout(p=0.6, name='drop') - new_layer_1 = keras.layers.GaussianDropout(rate=0.6, name='drop') - new_layer_2 = keras.layers.GaussianDropout(0.6, name='drop') - assert json.dumps(old_layer.get_config()) == json.dumps(new_layer_1.get_config()) - assert json.dumps(old_layer.get_config()) == json.dumps(new_layer_2.get_config()) + new_layer1 = keras.layers.GaussianDropout(rate=0.6, name='drop') + new_layer2 = keras.layers.GaussianDropout(0.6, name='drop') + assert json.dumps(old_layer.get_config()) == json.dumps(new_layer1.get_config()) + assert json.dumps(old_layer.get_config()) == json.dumps(new_layer2.get_config()) -@keras_test def test_maxpooling2d_legacy_interface(): - old_layer = keras.layers.MaxPooling2D(pool_size=(2, 2), border_mode='valid', name='maxpool2d') - new_layer = keras.layers.MaxPool2D(pool_size=2, padding='valid', name='maxpool2d') + old_layer = keras.layers.MaxPooling2D( + 
pool_size=(2, 2), border_mode='valid', name='maxpool2d') + new_layer = keras.layers.MaxPool2D( + pool_size=2, padding='valid', name='maxpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) old_layer = keras.layers.MaxPooling2D((2, 2), 2, 'valid', name='maxpool2d') - new_layer = keras.layers.MaxPool2D(pool_size=2, strides=2, padding='valid', name='maxpool2d') + new_layer = keras.layers.MaxPool2D( + pool_size=2, strides=2, padding='valid', name='maxpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.MaxPooling2D((2, 2), padding='valid', dim_ordering='tf', name='maxpool2d') - new_layer = keras.layers.MaxPool2D(pool_size=2, padding='valid', data_format='channels_last', name='maxpool2d') + old_layer = keras.layers.MaxPooling2D( + (2, 2), padding='valid', dim_ordering='tf', name='maxpool2d') + new_layer = keras.layers.MaxPool2D( + pool_size=2, padding='valid', data_format='channels_last', name='maxpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.MaxPooling2D((2, 2), padding='valid', dim_ordering='th', name='maxpool2d') - new_layer = keras.layers.MaxPool2D(pool_size=2, padding='valid', data_format='channels_first', name='maxpool2d') + old_layer = keras.layers.MaxPooling2D( + (2, 2), padding='valid', dim_ordering='th', name='maxpool2d') + new_layer = keras.layers.MaxPool2D( + pool_size=2, padding='valid', data_format='channels_first', + name='maxpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.MaxPooling2D((2, 2), padding='valid', dim_ordering='default', name='maxpool2d') - new_layer = keras.layers.MaxPool2D(pool_size=2, padding='valid', name='maxpool2d') + old_layer = keras.layers.MaxPooling2D( + (2, 2), padding='valid', dim_ordering='default', name='maxpool2d') + new_layer = keras.layers.MaxPool2D( + pool_size=2, padding='valid', name='maxpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_avgpooling2d_legacy_interface(): - old_layer = keras.layers.AveragePooling2D(pool_size=(2, 2), border_mode='valid', name='avgpooling2d') - new_layer = keras.layers.AvgPool2D(pool_size=(2, 2), padding='valid', name='avgpooling2d') + old_layer = keras.layers.AveragePooling2D( + pool_size=(2, 2), border_mode='valid', name='avgpooling2d') + new_layer = keras.layers.AvgPool2D( + pool_size=(2, 2), padding='valid', name='avgpooling2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.AveragePooling2D((2, 2), (2, 2), 'valid', name='avgpooling2d') - new_layer = keras.layers.AvgPool2D(pool_size=(2, 2), strides=(2, 2), padding='valid', name='avgpooling2d') + old_layer = keras.layers.AveragePooling2D( + (2, 2), (2, 2), 'valid', name='avgpooling2d') + new_layer = keras.layers.AvgPool2D( + pool_size=(2, 2), strides=(2, 2), padding='valid', name='avgpooling2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.AveragePooling2D((2, 2), padding='valid', dim_ordering='tf', name='avgpooling2d') - new_layer = keras.layers.AvgPool2D(pool_size=2, padding='valid', data_format='channels_last', name='avgpooling2d') + old_layer = keras.layers.AveragePooling2D( + (2, 2), padding='valid', dim_ordering='tf', name='avgpooling2d') + new_layer = keras.layers.AvgPool2D( + pool_size=2, padding='valid', data_format='channels_last', + 
name='avgpooling2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.AveragePooling2D((2, 2), padding='valid', dim_ordering='th', name='avgpooling2d') - new_layer = keras.layers.AvgPool2D(pool_size=2, padding='valid', data_format='channels_first', name='avgpooling2d') + old_layer = keras.layers.AveragePooling2D( + (2, 2), padding='valid', dim_ordering='th', name='avgpooling2d') + new_layer = keras.layers.AvgPool2D( + pool_size=2, padding='valid', data_format='channels_first', + name='avgpooling2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.AveragePooling2D((2, 2), padding='valid', dim_ordering='default', name='avgpooling2d') - new_layer = keras.layers.AvgPool2D(pool_size=2, padding='valid', name='avgpooling2d') + old_layer = keras.layers.AveragePooling2D( + (2, 2), padding='valid', dim_ordering='default', name='avgpooling2d') + new_layer = keras.layers.AvgPool2D( + pool_size=2, padding='valid', name='avgpooling2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_maxpooling3d_legacy_interface(): - old_layer = keras.layers.MaxPooling3D(pool_size=(2, 2, 2), border_mode='valid', name='maxpool3d') - new_layer = keras.layers.MaxPool3D(pool_size=(2, 2, 2), padding='valid', name='maxpool3d') + old_layer = keras.layers.MaxPooling3D( + pool_size=(2, 2, 2), border_mode='valid', name='maxpool3d') + new_layer = keras.layers.MaxPool3D( + pool_size=(2, 2, 2), padding='valid', name='maxpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.MaxPooling3D((2, 2, 2), (2, 2, 2), 'valid', name='maxpool3d') - new_layer = keras.layers.MaxPool3D(pool_size=(2, 2, 2), strides=(2, 2, 2), padding='valid', name='maxpool3d') + old_layer = keras.layers.MaxPooling3D( + (2, 2, 2), (2, 2, 2), 'valid', name='maxpool3d') + new_layer = keras.layers.MaxPool3D( + pool_size=(2, 2, 2), strides=(2, 2, 2), padding='valid', name='maxpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.MaxPooling3D((2, 2, 2), padding='valid', dim_ordering='tf', name='maxpool3d') - new_layer = keras.layers.MaxPool3D(pool_size=(2, 2, 2), padding='valid', data_format='channels_last', name='maxpool3d') + old_layer = keras.layers.MaxPooling3D( + (2, 2, 2), padding='valid', dim_ordering='tf', name='maxpool3d') + new_layer = keras.layers.MaxPool3D( + pool_size=(2, 2, 2), padding='valid', data_format='channels_last', + name='maxpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.MaxPooling3D((2, 2, 2), padding='valid', dim_ordering='th', name='maxpool3d') - new_layer = keras.layers.MaxPool3D(pool_size=(2, 2, 2), padding='valid', data_format='channels_first', name='maxpool3d') + old_layer = keras.layers.MaxPooling3D( + (2, 2, 2), padding='valid', dim_ordering='th', name='maxpool3d') + new_layer = keras.layers.MaxPool3D( + pool_size=(2, 2, 2), padding='valid', data_format='channels_first', + name='maxpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.MaxPooling3D((2, 2, 2), padding='valid', dim_ordering='default', name='maxpool3d') - new_layer = keras.layers.MaxPool3D(pool_size=(2, 2, 2), padding='valid', name='maxpool3d') + old_layer = keras.layers.MaxPooling3D( + (2, 2, 2), padding='valid', dim_ordering='default', name='maxpool3d') + 
new_layer = keras.layers.MaxPool3D( + pool_size=(2, 2, 2), padding='valid', name='maxpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_avgpooling3d_legacy_interface(): - old_layer = keras.layers.AveragePooling3D(pool_size=(2, 2, 2), border_mode='valid', name='avgpooling3d') - new_layer = keras.layers.AvgPool3D(pool_size=(2, 2, 2), padding='valid', name='avgpooling3d') + old_layer = keras.layers.AveragePooling3D( + pool_size=(2, 2, 2), border_mode='valid', name='avgpooling3d') + new_layer = keras.layers.AvgPool3D( + pool_size=(2, 2, 2), padding='valid', name='avgpooling3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.AveragePooling3D((2, 2, 2), (2, 2, 2), 'valid', name='avgpooling3d') - new_layer = keras.layers.AvgPool3D(pool_size=(2, 2, 2), strides=(2, 2, 2), padding='valid', name='avgpooling3d') + old_layer = keras.layers.AveragePooling3D( + (2, 2, 2), (2, 2, 2), 'valid', name='avgpooling3d') + new_layer = keras.layers.AvgPool3D( + pool_size=(2, 2, 2), strides=(2, 2, 2), padding='valid', + name='avgpooling3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.AveragePooling3D((2, 2, 2), padding='valid', dim_ordering='tf', name='avgpooling3d') - new_layer = keras.layers.AvgPool3D(pool_size=(2, 2, 2), padding='valid', data_format='channels_last', name='avgpooling3d') + old_layer = keras.layers.AveragePooling3D( + (2, 2, 2), padding='valid', dim_ordering='tf', name='avgpooling3d') + new_layer = keras.layers.AvgPool3D( + pool_size=(2, 2, 2), padding='valid', data_format='channels_last', + name='avgpooling3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.AveragePooling3D((2, 2, 2), padding='valid', dim_ordering='th', name='avgpooling3d') - new_layer = keras.layers.AvgPool3D(pool_size=(2, 2, 2), padding='valid', data_format='channels_first', name='avgpooling3d') + old_layer = keras.layers.AveragePooling3D( + (2, 2, 2), padding='valid', dim_ordering='th', name='avgpooling3d') + new_layer = keras.layers.AvgPool3D( + pool_size=(2, 2, 2), padding='valid', data_format='channels_first', + name='avgpooling3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.AveragePooling3D((2, 2, 2), padding='valid', dim_ordering='default', name='avgpooling3d') - new_layer = keras.layers.AvgPool3D(pool_size=(2, 2, 2), padding='valid', name='avgpooling3d') + old_layer = keras.layers.AveragePooling3D( + (2, 2, 2), padding='valid', dim_ordering='default', name='avgpooling3d') + new_layer = keras.layers.AvgPool3D( + pool_size=(2, 2, 2), padding='valid', name='avgpooling3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_global_maxpooling2d_legacy_interface(): - old_layer = keras.layers.GlobalMaxPooling2D(dim_ordering='tf', name='global_maxpool2d') - new_layer = keras.layers.GlobalMaxPool2D(data_format='channels_last', name='global_maxpool2d') + old_layer = keras.layers.GlobalMaxPooling2D(dim_ordering='tf', + name='global_maxpool2d') + new_layer = keras.layers.GlobalMaxPool2D(data_format='channels_last', + name='global_maxpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.GlobalMaxPooling2D(dim_ordering='th', name='global_maxpool2d') - new_layer = keras.layers.GlobalMaxPool2D(data_format='channels_first', 
name='global_maxpool2d') + old_layer = keras.layers.GlobalMaxPooling2D(dim_ordering='th', + name='global_maxpool2d') + new_layer = keras.layers.GlobalMaxPool2D(data_format='channels_first', + name='global_maxpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.GlobalMaxPooling2D(dim_ordering='default', name='global_maxpool2d') + old_layer = keras.layers.GlobalMaxPooling2D(dim_ordering='default', + name='global_maxpool2d') new_layer = keras.layers.GlobalMaxPool2D(name='global_maxpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_global_avgpooling2d_legacy_interface(): - old_layer = keras.layers.GlobalAveragePooling2D(dim_ordering='tf', name='global_avgpool2d') - new_layer = keras.layers.GlobalAvgPool2D(data_format='channels_last', name='global_avgpool2d') + old_layer = keras.layers.GlobalAveragePooling2D(dim_ordering='tf', + name='global_avgpool2d') + new_layer = keras.layers.GlobalAvgPool2D(data_format='channels_last', + name='global_avgpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.GlobalAveragePooling2D(dim_ordering='th', name='global_avgpool2d') - new_layer = keras.layers.GlobalAvgPool2D(data_format='channels_first', name='global_avgpool2d') + old_layer = keras.layers.GlobalAveragePooling2D(dim_ordering='th', + name='global_avgpool2d') + new_layer = keras.layers.GlobalAvgPool2D(data_format='channels_first', + name='global_avgpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.GlobalAveragePooling2D(dim_ordering='default', name='global_avgpool2d') + old_layer = keras.layers.GlobalAveragePooling2D(dim_ordering='default', + name='global_avgpool2d') new_layer = keras.layers.GlobalAvgPool2D(name='global_avgpool2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_global_maxpooling3d_legacy_interface(): - old_layer = keras.layers.GlobalMaxPooling3D(dim_ordering='tf', name='global_maxpool3d') - new_layer = keras.layers.GlobalMaxPool3D(data_format='channels_last', name='global_maxpool3d') + old_layer = keras.layers.GlobalMaxPooling3D(dim_ordering='tf', + name='global_maxpool3d') + new_layer = keras.layers.GlobalMaxPool3D(data_format='channels_last', + name='global_maxpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.GlobalMaxPooling3D(dim_ordering='th', name='global_maxpool3d') - new_layer = keras.layers.GlobalMaxPool3D(data_format='channels_first', name='global_maxpool3d') + old_layer = keras.layers.GlobalMaxPooling3D(dim_ordering='th', + name='global_maxpool3d') + new_layer = keras.layers.GlobalMaxPool3D(data_format='channels_first', + name='global_maxpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.GlobalMaxPooling3D(dim_ordering='default', name='global_maxpool3d') + old_layer = keras.layers.GlobalMaxPooling3D(dim_ordering='default', + name='global_maxpool3d') new_layer = keras.layers.GlobalMaxPool3D(name='global_maxpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_global_avgpooling3d_legacy_interface(): - old_layer = keras.layers.GlobalAveragePooling3D(dim_ordering='tf', name='global_avgpool3d') - new_layer = keras.layers.GlobalAvgPool3D(data_format='channels_last', name='global_avgpool3d') + 
old_layer = keras.layers.GlobalAveragePooling3D(dim_ordering='tf', + name='global_avgpool3d') + new_layer = keras.layers.GlobalAvgPool3D(data_format='channels_last', + name='global_avgpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.GlobalAveragePooling3D(dim_ordering='th', name='global_avgpool3d') - new_layer = keras.layers.GlobalAvgPool3D(data_format='channels_first', name='global_avgpool3d') + old_layer = keras.layers.GlobalAveragePooling3D(dim_ordering='th', + name='global_avgpool3d') + new_layer = keras.layers.GlobalAvgPool3D(data_format='channels_first', + name='global_avgpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.GlobalAveragePooling3D(dim_ordering='default', name='global_avgpool3d') + old_layer = keras.layers.GlobalAveragePooling3D(dim_ordering='default', + name='global_avgpool3d') new_layer = keras.layers.GlobalAvgPool3D(name='global_avgpool3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_upsampling1d_legacy_interface(): old_layer = keras.layers.UpSampling1D(length=3, name='us1d') new_layer_1 = keras.layers.UpSampling1D(size=3, name='us1d') @@ -399,14 +448,13 @@ def test_upsampling1d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer_2.get_config()) -@keras_test def test_upsampling2d_legacy_interface(): old_layer = keras.layers.UpSampling2D((2, 2), dim_ordering='tf', name='us2d') - new_layer = keras.layers.UpSampling2D((2, 2), data_format='channels_last', name='us2d') + new_layer = keras.layers.UpSampling2D((2, 2), data_format='channels_last', + name='us2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_upsampling3d_legacy_interface(): old_layer = keras.layers.UpSampling3D((2, 2, 2), dim_ordering='tf', @@ -417,7 +465,6 @@ def test_upsampling3d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_conv2d_legacy_interface(): old_layer = keras.layers.Convolution2D(5, 3, 3, name='conv') new_layer = keras.layers.Conv2D(5, (3, 3), name='conv') @@ -454,21 +501,23 @@ def test_conv2d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_deconv2d_legacy_interface(): old_layer = keras.layers.Deconvolution2D(5, 3, 3, (6, 7, 5), name='deconv') new_layer = keras.layers.Conv2DTranspose(5, (3, 3), name='deconv') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.Deconvolution2D(5, 3, 3, output_shape=(6, 7, 5), name='deconv') + old_layer = keras.layers.Deconvolution2D(5, 3, 3, output_shape=(6, 7, 5), + name='deconv') new_layer = keras.layers.Conv2DTranspose(5, (3, 3), name='deconv') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.Deconvolution2D(5, 3, nb_col=3, output_shape=(6, 7, 5), name='deconv') + old_layer = keras.layers.Deconvolution2D(5, 3, nb_col=3, output_shape=(6, 7, 5), + name='deconv') new_layer = keras.layers.Conv2DTranspose(5, (3, 3), name='deconv') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) - old_layer = keras.layers.Deconvolution2D(5, nb_row=3, nb_col=3, output_shape=(6, 7, 5), name='deconv') + old_layer = keras.layers.Deconvolution2D(5, nb_row=3, nb_col=3, + output_shape=(6, 7, 5), name='deconv') new_layer = 
keras.layers.Conv2DTranspose(5, (3, 3), name='deconv') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) @@ -497,7 +546,6 @@ def test_deconv2d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_conv1d_legacy_interface(): old_layer = keras.layers.Convolution1D(5, filter_length=3, @@ -528,7 +576,6 @@ def test_conv1d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_separable_conv2d_legacy_interface(): old_layer = keras.layers.SeparableConv2D(5, 3, 3, name='conv') new_layer = keras.layers.SeparableConv2D(5, (3, 3), name='conv') @@ -568,7 +615,6 @@ def test_separable_conv2d_legacy_interface(): assert old_config == new_config -@keras_test def test_conv3d_legacy_interface(): old_layer = keras.layers.Convolution3D(5, 3, 3, 4, name='conv') new_layer = keras.layers.Conv3D(5, (3, 3, 4), name='conv') @@ -616,7 +662,6 @@ def test_conv3d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_convlstm2d_legacy_interface(): old_layer = keras.layers.ConvLSTM2D(5, 3, 3, name='conv') new_layer = keras.layers.ConvLSTM2D(5, (3, 3), name='conv') @@ -661,7 +706,6 @@ def test_convlstm2d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_batchnorm_legacy_interface(): old_layer = keras.layers.BatchNormalization(mode=0, name='bn') new_layer = keras.layers.BatchNormalization(name='bn') @@ -677,7 +721,6 @@ def test_batchnorm_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_atrousconv1d_legacy_interface(): old_layer = keras.layers.AtrousConvolution1D(5, 3, init='normal', @@ -702,7 +745,6 @@ def test_atrousconv1d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_atrousconv2d_legacy_interface(): old_layer = keras.layers.AtrousConvolution2D( 5, 3, 3, @@ -730,7 +772,6 @@ def test_atrousconv2d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_zeropadding2d_legacy_interface(): old_layer = keras.layers.ZeroPadding2D(padding={'right_pad': 4, 'bottom_pad': 2, @@ -744,7 +785,6 @@ def test_zeropadding2d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_zeropadding3d_legacy_interface(): old_layer = keras.layers.ZeroPadding3D((2, 2, 2), dim_ordering='tf', @@ -755,21 +795,18 @@ def test_zeropadding3d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_cropping2d_legacy_interface(): old_layer = keras.layers.Cropping2D(dim_ordering='tf', name='c2d') new_layer = keras.layers.Cropping2D(data_format='channels_last', name='c2d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_cropping3d_legacy_interface(): old_layer = keras.layers.Cropping3D(dim_ordering='tf', name='c3d') new_layer = keras.layers.Cropping3D(data_format='channels_last', name='c3d') assert json.dumps(old_layer.get_config()) == json.dumps(new_layer.get_config()) -@keras_test def test_generator_methods_interface(): def train_generator(): x = np.random.randn(2, 2) @@ -817,7 +854,6 @@ def test_spatialdropout1d_legacy_interface(): assert 
json.dumps(old_layer.get_config()) == json.dumps(new_layer_2.get_config()) -@keras_test def test_spatialdropout2d_legacy_interface(): old_layer = keras.layers.SpatialDropout2D(p=0.5, dim_ordering='tf', @@ -832,7 +868,6 @@ def test_spatialdropout2d_legacy_interface(): assert json.dumps(old_layer.get_config()) == json.dumps(new_layer_2.get_config()) -@keras_test def test_spatialdropout3d_legacy_interface(): old_layer = keras.layers.SpatialDropout3D(p=0.5, dim_ordering='tf', @@ -849,7 +884,6 @@ def test_spatialdropout3d_legacy_interface(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support get_updates() yet.') -@keras_test def test_optimizer_get_updates_legacy_interface(): for optimizer_cls in [keras.optimizers.RMSprop, keras.optimizers.SGD, diff --git a/tests/keras/legacy/layers_test.py b/tests/keras/legacy/layers_test.py index 766d70c19bf..5c434fdfd2f 100644 --- a/tests/keras/legacy/layers_test.py +++ b/tests/keras/legacy/layers_test.py @@ -1,13 +1,11 @@ import pytest -from keras.utils.test_utils import keras_test from keras.utils.test_utils import layer_test from keras.legacy import layers as legacy_layers from keras import regularizers from keras import constraints -@keras_test def test_highway(): layer_test(legacy_layers.Highway, kwargs={}, @@ -22,7 +20,6 @@ def test_highway(): input_shape=(3, 2)) -@keras_test def test_maxout_dense(): layer_test(legacy_layers.MaxoutDense, kwargs={'output_dim': 3}, diff --git a/tests/keras/metrics_test.py b/tests/keras/metrics_test.py index bafe15b28e6..da790422b13 100644 --- a/tests/keras/metrics_test.py +++ b/tests/keras/metrics_test.py @@ -1,10 +1,10 @@ import pytest import numpy as np +from numpy.testing import assert_allclose import keras from keras import metrics from keras import backend as K -from keras.utils.test_utils import keras_test all_metrics = [ metrics.binary_accuracy, @@ -43,7 +43,6 @@ ] -@keras_test def test_metrics(): y_a = K.variable(np.random.random((6, 7))) y_b = K.variable(np.random.random((6, 7))) @@ -65,12 +64,12 @@ def test_sparse_metrics(): assert K.eval(metric(y_a, y_b)).shape == (6,) -@keras_test def test_sparse_categorical_accuracy_correctness(): y_a = K.variable(np.random.randint(0, 7, (6,)), dtype=K.floatx()) y_b = K.variable(np.random.random((6, 7)), dtype=K.floatx()) # use one_hot embedding to convert sparse labels to equivalent dense labels - y_a_dense_labels = K.cast(K.one_hot(K.cast(y_a, dtype='int32'), num_classes=7), dtype=K.floatx()) + y_a_dense_labels = K.cast(K.one_hot(K.cast(y_a, dtype='int32'), num_classes=7), + dtype=K.floatx()) sparse_categorical_acc = metrics.sparse_categorical_accuracy(y_a, y_b) categorical_acc = metrics.categorical_accuracy(y_a_dense_labels, y_b) assert np.allclose(K.eval(sparse_categorical_acc), K.eval(categorical_acc)) @@ -101,7 +100,6 @@ def test_invalid_get(): @pytest.mark.skipif((K.backend() == 'cntk'), reason='CNTK backend does not support top_k yet') -@keras_test def test_top_k_categorical_accuracy(): y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) y_true = K.variable(np.array([[0, 1, 0], [1, 0, 0]])) @@ -116,13 +114,17 @@ def test_top_k_categorical_accuracy(): assert failure_result == 0 -@pytest.mark.skipif(K.backend() == 'mxnet', - reason='MXNet backend does not support `sparse` yet.') @pytest.mark.skipif((K.backend() == 'cntk'), - reason='keras cntk backend does not support `top_k` yet') -def test_sparse_top_k_categorical_accuracy(): - y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) - y_true = 
K.variable(np.array([[1], [0]])) + reason='CNTK backend does not support top_k yet') +@pytest.mark.parametrize('y_pred, y_true', [ + # Test correctness if the shape of y_true is (num_samples, 1) + (np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]), np.array([[1], [0]])), + # Test correctness if the shape of y_true is (num_samples,) + (np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]), np.array([1, 0])), +]) +def test_sparse_top_k_categorical_accuracy(y_pred, y_true): + y_pred = K.variable(y_pred) + y_true = K.variable(y_true) success_result = K.eval( metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3)) @@ -139,7 +141,6 @@ def test_sparse_top_k_categorical_accuracy(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support `update` operations yet.') -@keras_test @pytest.mark.parametrize('metrics_mode', ['list', 'dict']) def test_stateful_metrics(metrics_mode): np.random.seed(1334) @@ -210,7 +211,8 @@ def __call__(self, y_true, y_pred): val_y = np.random.randint(2, size=(val_samples, 1)) # Test fit and evaluate - history = model.fit(x, y, validation_data=(val_x, val_y), epochs=1, batch_size=10) + history = model.fit(x, y, validation_data=(val_x, val_y), + epochs=1, batch_size=10) outs = model.evaluate(x, y, batch_size=10) preds = model.predict(x) @@ -223,25 +225,31 @@ def ref_true_pos(y_true, y_pred): # Test correctness of the validation metric computation val_preds = model.predict(val_x) val_outs = model.evaluate(val_x, val_y, batch_size=10) - np.testing.assert_allclose(val_outs[2], ref_true_pos(val_y, val_preds), atol=1e-5) - np.testing.assert_allclose(val_outs[2], history.history['val_true_positives'][-1], atol=1e-5) + assert_allclose(val_outs[2], ref_true_pos(val_y, val_preds), atol=1e-5) + assert_allclose(val_outs[2], history.history['val_true_positives'][-1], + atol=1e-5) # Test with generators gen = [(np.array([x0]), np.array([y0])) for x0, y0 in zip(x, y)] val_gen = [(np.array([x0]), np.array([y0])) for x0, y0 in zip(val_x, val_y)] history = model.fit_generator(iter(gen), epochs=1, steps_per_epoch=samples, - validation_data=iter(val_gen), validation_steps=val_samples) - outs = model.evaluate_generator(iter(gen), steps=samples) - preds = model.predict_generator(iter(gen), steps=samples) + validation_data=iter(val_gen), + validation_steps=val_samples) + outs = model.evaluate_generator(iter(gen), steps=samples, workers=0) + preds = model.predict_generator(iter(gen), steps=samples, workers=0) # Test correctness of the metric re ref_true_pos() - np.testing.assert_allclose(outs[2], ref_true_pos(y, preds), atol=1e-5) + np.testing.assert_allclose(outs[2], ref_true_pos(y, preds), + atol=1e-5) # Test correctness of the validation metric computation - val_preds = model.predict_generator(iter(val_gen), steps=val_samples) - val_outs = model.evaluate_generator(iter(val_gen), steps=val_samples) - np.testing.assert_allclose(val_outs[2], ref_true_pos(val_y, val_preds), atol=1e-5) - np.testing.assert_allclose(val_outs[2], history.history['val_true_positives'][-1], atol=1e-5) + val_preds = model.predict_generator(iter(val_gen), steps=val_samples, workers=0) + val_outs = model.evaluate_generator(iter(val_gen), steps=val_samples, workers=0) + np.testing.assert_allclose(val_outs[2], ref_true_pos(val_y, val_preds), + atol=1e-5) + np.testing.assert_allclose(val_outs[2], + history.history['val_true_positives'][-1], + atol=1e-5) if __name__ == '__main__': diff --git a/tests/keras/optimizers_test.py b/tests/keras/optimizers_test.py index 68b3f5b70fd..fd6242c698e 100644 --- 
a/tests/keras/optimizers_test.py +++ b/tests/keras/optimizers_test.py @@ -7,7 +7,6 @@ from keras import optimizers, Input from keras.models import Sequential, Model from keras.layers.core import Dense, Activation, Lambda -from keras.utils.test_utils import keras_test from keras.utils.np_utils import to_categorical from keras import backend as K @@ -68,7 +67,6 @@ def _test_optimizer(optimizer, target=0.75): reason='MXNet backend does not support constraints. ' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test @pytest.mark.skipif((K.backend() != 'tensorflow'), reason="Only Tensorflow raises a " "ValueError if the gradient is null.") @@ -80,14 +78,14 @@ def test_no_grad(): mod = Model(inp, x) mod.compile('sgd', 'mse') with pytest.raises(ValueError): - mod.fit(np.zeros([10, 3]), np.zeros([10, 1], np.float32), batch_size=10, epochs=10) + mod.fit(np.zeros([10, 3]), np.zeros([10, 1], np.float32), + batch_size=10, epochs=10) @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support constraints. ' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test def test_sgd(): sgd = optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True) _test_optimizer(sgd) @@ -97,7 +95,6 @@ def test_sgd(): reason='MXNet backend does not support constraints. ' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test def test_rmsprop(): _test_optimizer(optimizers.RMSprop()) _test_optimizer(optimizers.RMSprop(decay=1e-3)) @@ -107,7 +104,6 @@ def test_rmsprop(): reason='MXNet backend does not support constraints. ' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test def test_adagrad(): _test_optimizer(optimizers.Adagrad()) _test_optimizer(optimizers.Adagrad(decay=1e-3)) @@ -117,7 +113,6 @@ def test_adagrad(): reason='MXNet backend does not support constraints. ' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test def test_adadelta(): _test_optimizer(optimizers.Adadelta(), target=0.6) _test_optimizer(optimizers.Adadelta(decay=1e-3), target=0.6) @@ -127,7 +122,6 @@ def test_adadelta(): reason='MXNet backend does not support constraints. ' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test def test_adam(): _test_optimizer(optimizers.Adam()) _test_optimizer(optimizers.Adam(decay=1e-3)) @@ -137,7 +131,6 @@ def test_adam(): reason='MXNet backend does not support constraints. ' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test def test_adamax(): _test_optimizer(optimizers.Adamax()) _test_optimizer(optimizers.Adamax(decay=1e-3)) @@ -147,7 +140,6 @@ def test_adamax(): reason='MXNet backend does not support constraints. ' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test def test_nadam(): _test_optimizer(optimizers.Nadam()) @@ -155,7 +147,6 @@ def test_nadam(): # https://github.com/deep-learning-tools/keras/issues/27 @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support Adam_AMSGrad optimizer yet.') -@keras_test def test_adam_amsgrad(): _test_optimizer(optimizers.Adam(amsgrad=True)) _test_optimizer(optimizers.Adam(amsgrad=True, decay=1e-3)) @@ -165,7 +156,6 @@ def test_adam_amsgrad(): reason='MXNet backend does not support constraints. 
' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test def test_clipnorm(): sgd = optimizers.SGD(lr=0.01, momentum=0.9, clipnorm=0.5) _test_optimizer(sgd) @@ -175,13 +165,11 @@ def test_clipnorm(): reason='MXNet backend does not support constraints. ' 'Keyword arguments such as `kernel_constraint` ' 'and `bias_constraint`') -@keras_test def test_clipvalue(): sgd = optimizers.SGD(lr=0.01, momentum=0.9, clipvalue=0.5) _test_optimizer(sgd) -@keras_test @pytest.mark.skipif((K.backend() != 'tensorflow'), reason='Requires TensorFlow backend') def test_tfoptimizer(): @@ -189,7 +177,8 @@ def test_tfoptimizer(): from tensorflow import train optimizer = optimizers.TFOptimizer(train.AdamOptimizer()) model = Sequential() - model.add(Dense(num_classes, input_shape=(3,), kernel_constraint=constraints.MaxNorm(1))) + model.add(Dense(num_classes, input_shape=(3,), + kernel_constraint=constraints.MaxNorm(1))) model.compile(loss='mean_squared_error', optimizer=optimizer) model.fit(np.random.random((5, 3)), np.random.random((5, num_classes)), epochs=1, batch_size=5, verbose=0) diff --git a/tests/keras/test_callbacks.py b/tests/keras/test_callbacks.py index 944a15bf0b6..22b7bbd3bd4 100644 --- a/tests/keras/test_callbacks.py +++ b/tests/keras/test_callbacks.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from numpy.testing import assert_allclose from csv import reader from csv import Sniffer import shutil @@ -12,9 +13,12 @@ from keras.models import Sequential, Model from keras.layers import Input, Dense, Dropout, add, dot, Lambda, Layer from keras.layers.convolutional import Conv2D -from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling1D, GlobalAveragePooling2D +from keras.layers.pooling import MaxPooling2D +from keras.layers.pooling import GlobalAveragePooling1D +from keras.layers.pooling import GlobalAveragePooling2D from keras.utils.test_utils import get_test_data -from keras.utils.test_utils import keras_test +from keras.utils.generic_utils import to_list +from keras.utils.generic_utils import unpack_singleton from keras import backend as K from keras.utils import np_utils try: @@ -31,14 +35,38 @@ test_samples = 20 -@keras_test +def data_generator(x, y, batch_size): + x = to_list(x) + y = to_list(y) + max_batch_index = len(x[0]) // batch_size + i = 0 + while 1: + x_batch = [array[i * batch_size: (i + 1) * batch_size] for array in x] + x_batch = unpack_singleton(x_batch) + + y_batch = [array[i * batch_size: (i + 1) * batch_size] for array in y] + y_batch = unpack_singleton(y_batch) + yield x_batch, y_batch + i += 1 + i = i % max_batch_index + + +# Changing the default arguments of get_test_data. 
+def get_data_callbacks(num_train=train_samples, + num_test=test_samples, + input_shape=(input_dim,), + classification=True, + num_classes=num_classes): + return get_test_data(num_train=num_train, + num_test=num_test, + input_shape=input_shape, + classification=classification, + num_classes=num_classes) + + def test_TerminateOnNaN(): np.random.seed(1337) - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) @@ -59,16 +87,7 @@ def test_TerminateOnNaN(): assert len(loss) == 1 assert loss[0] == np.inf or np.isnan(loss[0]) - # case 2 fit_generator - def data_generator(): - max_batch_index = len(X_train) // batch_size - i = 0 - while 1: - yield (X_train[i * batch_size: (i + 1) * batch_size], - y_train[i * batch_size: (i + 1) * batch_size]) - i += 1 - i = i % max_batch_index - history = model.fit_generator(data_generator(), + history = model.fit_generator(data_generator(X_train, y_train, batch_size), len(X_train), validation_data=(X_test, y_test), callbacks=cbks, @@ -78,15 +97,10 @@ def data_generator(): assert loss[0] == np.inf or np.isnan(loss[0]) -@keras_test def test_stop_training_csv(tmpdir): np.random.seed(1337) fp = str(tmpdir / 'test.csv') - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) @@ -104,7 +118,8 @@ def data_generator(): tot = 0 while 1: if tot > 3 * len(X_train): - yield np.ones([batch_size, input_dim]) * np.nan, np.ones([batch_size, num_classes]) * np.nan + yield (np.ones([batch_size, input_dim]) * np.nan, + np.ones([batch_size, num_classes]) * np.nan) else: yield (X_train[i * batch_size: (i + 1) * batch_size], y_train[i * batch_size: (i + 1) * batch_size]) @@ -130,15 +145,10 @@ def data_generator(): os.remove(fp) -@keras_test def test_ModelCheckpoint(tmpdir): np.random.seed(1337) filepath = str(tmpdir / 'checkpoint.h5') - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) # case 1 @@ -207,14 +217,9 @@ def test_ModelCheckpoint(tmpdir): assert not tmpdir.listdir() -@keras_test def test_EarlyStopping(): np.random.seed(1337) - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) model = Sequential() @@ -238,7 +243,6 @@ def test_EarlyStopping(): validation_data=(X_test, y_test), callbacks=cbks, epochs=20) -@keras_test def test_EarlyStopping_reuse(): np.random.seed(1337) patience = 3 @@ -261,18 +265,24 @@ def test_EarlyStopping_reuse(): assert len(hist.epoch) >= patience -@keras_test def test_EarlyStopping_patience(): class DummyModel(object): def __init__(self): self.stop_training = False + def 
get_weights(self): + return [] + + def set_weights(self, weights): + pass + early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=2) early_stop.model = DummyModel() losses = [0.0860, 0.1096, 0.1040, 0.1019] - # Should stop after epoch 3, as the loss has not improved after patience=2 epochs. + # Should stop after epoch 3, + # as the loss has not improved after patience=2 epochs. epochs_trained = 0 early_stop.on_train_begin() @@ -286,14 +296,20 @@ def __init__(self): assert epochs_trained == 3 -@keras_test def test_EarlyStopping_baseline(): class DummyModel(object): def __init__(self): self.stop_training = False + def get_weights(self): + return [] + + def set_weights(self, weights): + pass + def baseline_tester(acc_levels): - early_stop = callbacks.EarlyStopping(monitor='val_acc', baseline=0.75, patience=2) + early_stop = callbacks.EarlyStopping(monitor='val_acc', baseline=0.75, + patience=2) early_stop.model = DummyModel() epochs_trained = 0 early_stop.on_train_begin() @@ -315,14 +331,85 @@ def baseline_tester(acc_levels): assert baseline_not_met == 2 -@keras_test +def test_EarlyStopping_final_weights(): + class DummyModel(object): + def __init__(self): + self.stop_training = False + self.weights = -1 + + def get_weights(self): + return self.weights + + def set_weights(self, weights): + self.weights = weights + + def set_weight_to_epoch(self, epoch): + self.weights = epoch + + early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=2) + early_stop.model = DummyModel() + + losses = [0.2, 0.15, 0.1, 0.11, 0.12] + + epochs_trained = 0 + early_stop.on_train_begin() + + for epoch in range(len(losses)): + epochs_trained += 1 + early_stop.model.set_weight_to_epoch(epoch=epoch) + early_stop.on_epoch_end(epoch, logs={'val_loss': losses[epoch]}) + + if early_stop.model.stop_training: + break + + # The best configuration is in the epoch 2 (loss = 0.1000), + # so with patience=2 we need to end up at epoch 4 + assert early_stop.model.get_weights() == 4 + + +def test_EarlyStopping_final_weights_when_restoring_model_weights(): + class DummyModel(object): + def __init__(self): + self.stop_training = False + self.weights = -1 + + def get_weights(self): + return self.weights + + def set_weights(self, weights): + self.weights = weights + + def set_weight_to_epoch(self, epoch): + self.weights = epoch + + early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=2, + restore_best_weights=True) + early_stop.model = DummyModel() + + losses = [0.2, 0.15, 0.1, 0.11, 0.12] + + # The best configuration is in the epoch 2 (loss = 0.1000). + + epochs_trained = 0 + early_stop.on_train_begin() + + for epoch in range(len(losses)): + epochs_trained += 1 + early_stop.model.set_weight_to_epoch(epoch=epoch) + early_stop.on_epoch_end(epoch, logs={'val_loss': losses[epoch]}) + + if early_stop.model.stop_training: + break + + # The best configuration is in epoch 2 (loss = 0.1000), + # and while patience = 2, we're restoring the best weights, + # so we end up at the epoch with the best weights, i.e. 
epoch 2 + assert early_stop.model.get_weights() == 2 + + def test_LearningRateScheduler(): np.random.seed(1337) - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) model = Sequential() @@ -338,14 +425,9 @@ def test_LearningRateScheduler(): assert (float(K.get_value(model.optimizer.lr)) - 0.2) < K.epsilon() -@keras_test def test_ReduceLROnPlateau(): np.random.seed(1337) - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) @@ -363,19 +445,20 @@ def make_model(): model = make_model() # This should reduce the LR after the first epoch (due to high epsilon). - cbks = [callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_delta=10, patience=1, cooldown=5)] + cbks = [callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, + min_delta=10, patience=1, cooldown=5)] model.fit(X_train, y_train, batch_size=batch_size, validation_data=(X_test, y_test), callbacks=cbks, epochs=5, verbose=2) - assert np.allclose(float(K.get_value(model.optimizer.lr)), 0.01, atol=K.epsilon()) + assert_allclose(float(K.get_value(model.optimizer.lr)), 0.01, atol=K.epsilon()) model = make_model() - cbks = [callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_delta=0, patience=1, cooldown=5)] + cbks = [callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, + min_delta=0, patience=1, cooldown=5)] model.fit(X_train, y_train, batch_size=batch_size, validation_data=(X_test, y_test), callbacks=cbks, epochs=5, verbose=2) - assert np.allclose(float(K.get_value(model.optimizer.lr)), 0.1, atol=K.epsilon()) + assert_allclose(float(K.get_value(model.optimizer.lr)), 0.1, atol=K.epsilon()) -@keras_test def test_ReduceLROnPlateau_patience(): class DummyOptimizer(object): def __init__(self): @@ -400,7 +483,6 @@ def __init__(self): assert all([lr == 1.0 for lr in lrs[:-1]]) and lrs[-1] < 1.0 -@keras_test def test_ReduceLROnPlateau_backwards_compatibility(): import warnings with warnings.catch_warnings(record=True) as ws: @@ -413,16 +495,11 @@ def test_ReduceLROnPlateau_backwards_compatibility(): assert reduce_on_plateau.min_delta == 1e-13 -@keras_test def test_CSVLogger(tmpdir): np.random.seed(1337) filepath = str(tmpdir / 'log.tsv') sep = '\t' - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) @@ -458,50 +535,32 @@ def make_model(): # case 3, reuse of CSVLogger object model.fit(X_train, y_train, batch_size=batch_size, - validation_data=(X_test, y_test), callbacks=cbks, epochs=1) + validation_data=(X_test, y_test), callbacks=cbks, epochs=2) import re with open(filepath) as csvfile: - output = " ".join(csvfile.readlines()) + list_lines = csvfile.readlines() + for line in list_lines: + assert line.count(sep) == 4 + assert len(list_lines) == 5 + output = " ".join(list_lines) assert 
len(re.findall('epoch', output)) == 1 os.remove(filepath) assert not tmpdir.listdir() -@keras_test @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend does not support it yet.') -def test_TensorBoard(tmpdir): +@pytest.mark.parametrize('update_freq', ['batch', 'epoch', 9]) +def test_TensorBoard(tmpdir, update_freq): np.random.seed(np.random.randint(1, 1e7)) filepath = str(tmpdir / 'logs') - (X_train, y_train), (X_test, y_test) = get_test_data( - num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) - def data_generator(train): - if train: - max_batch_index = len(X_train) // batch_size - else: - max_batch_index = len(X_test) // batch_size - i = 0 - while 1: - if train: - # simulate multi-input/output models - yield (X_train[i * batch_size: (i + 1) * batch_size], - y_train[i * batch_size: (i + 1) * batch_size]) - else: - yield (X_test[i * batch_size: (i + 1) * batch_size], - y_test[i * batch_size: (i + 1) * batch_size]) - i += 1 - i = i % max_batch_index - class DummyStatefulMetric(Layer): def __init__(self, name='dummy_stateful_metric', **kwargs): @@ -532,7 +591,8 @@ def callbacks_factory(histogram_freq, embeddings_freq=1): embeddings_freq=embeddings_freq, embeddings_layer_names=['dense_1'], embeddings_data=X_test, - batch_size=5)] + batch_size=5, + update_freq=update_freq)] # fit without validation data model.fit(X_train, y_train, batch_size=batch_size, @@ -545,12 +605,14 @@ def callbacks_factory(histogram_freq, embeddings_freq=1): callbacks=callbacks_factory(histogram_freq=0), epochs=2) # fit generator without validation data - model.fit_generator(data_generator(True), len(X_train), epochs=2, + train_generator = data_generator(X_train, y_train, batch_size) + model.fit_generator(train_generator, len(X_train), epochs=2, callbacks=callbacks_factory(histogram_freq=0, embeddings_freq=0)) # fit generator with validation data and accuracy - model.fit_generator(data_generator(True), len(X_train), epochs=2, + train_generator = data_generator(X_train, y_train, batch_size) + model.fit_generator(train_generator, len(X_train), epochs=2, validation_data=(X_test, y_test), callbacks=callbacks_factory(histogram_freq=1)) @@ -559,39 +621,16 @@ def callbacks_factory(histogram_freq, embeddings_freq=1): assert not tmpdir.listdir() -@keras_test @pytest.mark.skipif((K.backend() != 'tensorflow'), reason='Requires TensorFlow backend') def test_TensorBoard_histogram_freq_must_have_validation_data(tmpdir): np.random.seed(np.random.randint(1, 1e7)) filepath = str(tmpdir / 'logs') - (X_train, y_train), (X_test, y_test) = get_test_data( - num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) - def data_generator(train): - if train: - max_batch_index = len(X_train) // batch_size - else: - max_batch_index = len(X_test) // batch_size - i = 0 - while 1: - if train: - # simulate multi-input/output models - yield (X_train[i * batch_size: (i + 1) * batch_size], - y_train[i * batch_size: (i + 1) * batch_size]) - else: - yield (X_test[i * batch_size: (i + 1) * batch_size], - y_test[i * batch_size: (i + 1) * batch_size]) - i += 1 - i = i % max_batch_index - inp = Input((input_dim,)) 
hidden = Dense(num_hidden, activation='relu')(inp) hidden = Dropout(0.1)(hidden) @@ -617,61 +656,45 @@ def callbacks_factory(histogram_freq, embeddings_freq=1): callbacks=callbacks_factory(histogram_freq=1), epochs=3) assert 'validation_data must be provided' in str(raised_exception.value) + train_generator = data_generator(X_train, y_train, batch_size) + validation_generator = data_generator(X_test, y_test, batch_size) + # fit generator without validation data should raise ValueError if # histogram_freq > 0 with pytest.raises(ValueError) as raised_exception: - model.fit_generator(data_generator(True), len(X_train), epochs=2, + model.fit_generator(train_generator, + len(X_train), epochs=2, callbacks=callbacks_factory(histogram_freq=1)) assert 'validation_data must be provided' in str(raised_exception.value) # fit generator with validation data generator should raise ValueError if # histogram_freq > 0 with pytest.raises(ValueError) as raised_exception: - model.fit_generator(data_generator(True), len(X_train), epochs=2, - validation_data=data_generator(False), + model.fit_generator(train_generator, len(X_train), epochs=2, + validation_data=validation_generator, validation_steps=1, callbacks=callbacks_factory(histogram_freq=1)) assert 'validation_data must be provided' in str(raised_exception.value) -@keras_test @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend does not support Lambda yet.') def test_TensorBoard_multi_input_output(tmpdir): np.random.seed(np.random.randint(1, 1e7)) filepath = str(tmpdir / 'logs') - (X_train, y_train), (X_test, y_test) = get_test_data( - num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim, input_dim), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks( + input_shape=(input_dim, input_dim)) + y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) - def data_generator(train): - if train: - max_batch_index = len(X_train) // batch_size - else: - max_batch_index = len(X_test) // batch_size - i = 0 - while 1: - if train: - # simulate multi-input/output models - yield ([X_train[i * batch_size: (i + 1) * batch_size]] * 2, - [y_train[i * batch_size: (i + 1) * batch_size]] * 2) - else: - yield ([X_test[i * batch_size: (i + 1) * batch_size]] * 2, - [y_test[i * batch_size: (i + 1) * batch_size]] * 2) - i += 1 - i = i % max_batch_index - inp1 = Input((input_dim, input_dim)) inp2 = Input((input_dim, input_dim)) inp_3d = add([inp1, inp2]) inp_2d = GlobalAveragePooling1D()(inp_3d) - inp_pair = Lambda(lambda x: x)([inp_3d, inp_2d]) # test a layer with a list of output tensors + # test a layer with a list of output tensors + inp_pair = Lambda(lambda x: x)([inp_3d, inp_2d]) hidden = dot(inp_pair, axes=-1) hidden = Dense(num_hidden, activation='relu')(hidden) hidden = Dropout(0.1)(hidden) @@ -702,13 +725,15 @@ def callbacks_factory(histogram_freq, embeddings_freq=1): validation_data=([X_test] * 2, [y_test] * 2), callbacks=callbacks_factory(histogram_freq=1), epochs=2) + train_generator = data_generator([X_train] * 2, [y_train] * 2, batch_size) + # fit generator without validation data - model.fit_generator(data_generator(True), len(X_train), epochs=2, + model.fit_generator(train_generator, len(X_train), epochs=2, callbacks=callbacks_factory(histogram_freq=0, embeddings_freq=0)) # fit generator with validation data and accuracy - model.fit_generator(data_generator(True), len(X_train), epochs=2, + model.fit_generator(train_generator, len(X_train), 
epochs=2, validation_data=([X_test] * 2, [y_test] * 2), callbacks=callbacks_factory(histogram_freq=1)) @@ -717,7 +742,6 @@ def callbacks_factory(histogram_freq, embeddings_freq=1): assert not tmpdir.listdir() -@keras_test @pytest.mark.skipif((K.backend() == 'mxnet'), reason='MXNet backend does not support it yet.') def test_TensorBoard_convnet(tmpdir): @@ -725,11 +749,10 @@ def test_TensorBoard_convnet(tmpdir): filepath = str(tmpdir / 'logs') input_shape = (16, 16, 3) - (x_train, y_train), (x_test, y_test) = get_test_data(num_train=500, - num_test=200, - input_shape=input_shape, - classification=True, - num_classes=num_classes) + (x_train, y_train), (x_test, y_test) = get_data_callbacks( + num_train=500, + num_test=200, + input_shape=input_shape) y_train = np_utils.to_categorical(y_train) y_test = np_utils.to_categorical(y_test) @@ -760,14 +783,40 @@ def test_TensorBoard_convnet(tmpdir): assert not tmpdir.listdir() -@keras_test +def test_TensorBoard_display_float_from_logs(tmpdir): + filepath = str(tmpdir / 'logs') + + input_shape = (3,) + (x_train, y_train), _ = get_data_callbacks(num_train=10, + num_test=0, + input_shape=input_shape) + y_train = np_utils.to_categorical(y_train) + + model = Sequential([ + Dense(num_classes, activation='softmax') + ]) + model.compile(loss='categorical_crossentropy', + optimizer='rmsprop') + + class CustomCallback(callbacks.Callback): + + def on_epoch_end(self, epoch, logs=None): + logs['test'] = 0. + + tsb = callbacks.TensorBoard(log_dir=filepath, + batch_size=16) + cbks = [CustomCallback(), tsb] + model.fit(x_train, y_train, epochs=2, batch_size=16, + callbacks=cbks, + verbose=0) + assert os.path.isdir(filepath) + shutil.rmtree(filepath) + assert not tmpdir.listdir() + + def test_CallbackValData(): np.random.seed(1337) - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) model = Sequential() @@ -781,24 +830,9 @@ def test_CallbackValData(): model.fit(X_train, y_train, batch_size=batch_size, validation_data=(X_test, y_test), callbacks=[cbk], epochs=1) - def data_generator(train): - if train: - max_batch_index = len(X_train) // batch_size - else: - max_batch_index = len(X_test) // batch_size - i = 0 - while 1: - if train: - yield (X_train[i * batch_size: (i + 1) * batch_size], - y_train[i * batch_size: (i + 1) * batch_size]) - else: - yield (X_test[i * batch_size: (i + 1) * batch_size], - y_test[i * batch_size: (i + 1) * batch_size]) - i += 1 - i = i % max_batch_index - cbk2 = callbacks.LambdaCallback(on_train_end=lambda x: 1) - model.fit_generator(data_generator(True), len(X_train), epochs=1, + train_generator = data_generator(X_train, y_train, batch_size) + model.fit_generator(train_generator, len(X_train), epochs=1, validation_data=(X_test, y_test), callbacks=[cbk2]) @@ -809,14 +843,9 @@ def data_generator(train): assert cbk.validation_data[2].shape == cbk2.validation_data[2].shape -@keras_test def test_LambdaCallback(): np.random.seed(1337) - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) model = Sequential() 
@@ -826,14 +855,16 @@ def test_LambdaCallback(): optimizer='sgd', metrics=['accuracy']) - # Start an arbitrary process that should run during model training and be terminated after training has completed. + # Start an arbitrary process that should run during model training and + # be terminated after training has completed. def f(): while True: pass p = multiprocessing.Process(target=f) p.start() - cleanup_callback = callbacks.LambdaCallback(on_train_end=lambda logs: p.terminate()) + cleanup_callback = callbacks.LambdaCallback( + on_train_end=lambda logs: p.terminate()) cbks = [cleanup_callback] model.fit(X_train, y_train, batch_size=batch_size, @@ -842,17 +873,12 @@ def f(): assert not p.is_alive() -@keras_test def test_TensorBoard_with_ReduceLROnPlateau(tmpdir): import shutil np.random.seed(np.random.randint(1, 1e7)) filepath = str(tmpdir / 'logs') - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) @@ -880,13 +906,8 @@ def test_TensorBoard_with_ReduceLROnPlateau(tmpdir): assert not tmpdir.listdir() -@keras_test def tests_RemoteMonitor(): - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) model = Sequential() @@ -902,13 +923,8 @@ def tests_RemoteMonitor(): validation_data=(X_test, y_test), callbacks=cbks, epochs=1) -@keras_test def tests_RemoteMonitorWithJsonPayload(): - (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples, - num_test=test_samples, - input_shape=(input_dim,), - classification=True, - num_classes=num_classes) + (X_train, y_train), (X_test, y_test) = get_data_callbacks() y_test = np_utils.to_categorical(y_test) y_train = np_utils.to_categorical(y_train) model = Sequential() @@ -948,7 +964,6 @@ def mxnet_model_checkpoint_test_helper(monitor, save_best_only, mode, prefix='te @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_mxnet_model_checkpoint_save_all_auto_mode(): mxnet_model_checkpoint_test_helper(monitor='val_loss', save_best_only=False, mode='auto') assert os.path.isfile('test-symbol.json') @@ -959,7 +974,6 @@ def test_mxnet_model_checkpoint_save_all_auto_mode(): @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_mxnet_model_checkpoint_save_all_min_mode(): mxnet_model_checkpoint_test_helper(monitor='val_loss', save_best_only=False, mode='min') assert os.path.isfile('test-symbol.json') @@ -970,7 +984,6 @@ def test_mxnet_model_checkpoint_save_all_min_mode(): @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_mxnet_model_checkpoint_save_all_max_mode(): mxnet_model_checkpoint_test_helper(monitor='val_acc', save_best_only=False, mode='max') assert os.path.isfile('test-symbol.json') @@ -981,7 +994,6 @@ def test_mxnet_model_checkpoint_save_all_max_mode(): @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_mxnet_model_checkpoint_save_best_max_mode(): 
mxnet_model_checkpoint_test_helper(monitor='val_acc', save_best_only=True, mode='max', epochs=2) # Since we say save_best_only, we need to have only one model file with test-0000.params and test-symbol.json @@ -995,7 +1007,6 @@ def test_mxnet_model_checkpoint_save_best_max_mode(): @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_mxnet_model_checkpoint_save_all_auto_mode_multi_epoch(): mxnet_model_checkpoint_test_helper(monitor='val_acc', save_best_only=False, mode='auto', epochs=2) assert os.path.isfile('test-symbol.json') diff --git a/tests/keras/test_sequential_model.py b/tests/keras/test_sequential_model.py index 9626cfa8f05..23afeeb0e7c 100644 --- a/tests/keras/test_sequential_model.py +++ b/tests/keras/test_sequential_model.py @@ -10,7 +10,7 @@ from keras.models import Sequential from keras.layers import Dense, Activation from keras.utils import np_utils -from keras.utils.test_utils import get_test_data, keras_test +from keras.utils.test_utils import get_test_data from keras.models import model_from_json, model_from_yaml from keras import losses from keras.engine.training_utils import make_batches @@ -34,7 +34,6 @@ def in_tmpdir(tmpdir): assert not tmpdir.listdir() -@keras_test def test_sequential_pop(): model = Sequential() model.add(Dense(num_hidden, input_dim=input_dim)) @@ -67,7 +66,6 @@ def _get_test_data(): return (x_train, y_train), (x_test, y_test) -@keras_test def test_sequential_fit_generator(): (x_train, y_train), (x_test, y_test) = _get_test_data() @@ -79,9 +77,11 @@ def data_generator(train): i = 0 while 1: if train: - yield (x_train[i * batch_size: (i + 1) * batch_size], y_train[i * batch_size: (i + 1) * batch_size]) + yield (x_train[i * batch_size: (i + 1) * batch_size], + y_train[i * batch_size: (i + 1) * batch_size]) else: - yield (x_test[i * batch_size: (i + 1) * batch_size], y_test[i * batch_size: (i + 1) * batch_size]) + yield (x_test[i * batch_size: (i + 1) * batch_size], + y_test[i * batch_size: (i + 1) * batch_size]) i += 1 i = i % max_batch_index @@ -104,7 +104,6 @@ def data_generator(train): model.evaluate(x_train, y_train) -@keras_test def test_sequential(in_tmpdir): (x_train, y_train), (x_test, y_test) = _get_test_data() @@ -126,18 +125,24 @@ def data_generator(x, y, batch_size=50): model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') - model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test)) - model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=2, validation_split=0.1) + model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, + validation_data=(x_test, y_test)) + model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=2, + validation_split=0.1) model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=0) - model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, shuffle=False) + model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, + shuffle=False) model.train_on_batch(x_train[:32], y_train[:32]) loss = model.evaluate(x_test, y_test) - prediction = model.predict_generator(data_generator(x_test, y_test), 1, max_queue_size=2, verbose=1) - gen_loss = model.evaluate_generator(data_generator(x_test, y_test, 50), 1, max_queue_size=2) - pred_loss = K.eval(K.mean(losses.get(model.loss)(K.variable(y_test), K.variable(prediction)))) + prediction = 
model.predict_generator(data_generator(x_test, y_test), 1, + max_queue_size=2, verbose=1) + gen_loss = model.evaluate_generator(data_generator(x_test, y_test, 50), 1, + max_queue_size=2) + pred_loss = K.eval(K.mean(losses.get(model.loss)(K.variable(y_test), + K.variable(prediction)))) assert(np.isclose(pred_loss, loss)) assert(np.isclose(gen_loss, loss)) @@ -160,9 +165,11 @@ def data_generator(x, y, batch_size=50): nloss = model.evaluate(x_test, y_test, verbose=0) assert(loss == nloss) - # test serialization + # Test serialization config = model.get_config() - Sequential.from_config(config) + assert 'name' in config + new_model = Sequential.from_config(config) + assert new_model.weights # Model should be built. model.summary() json_str = model.to_json() @@ -172,7 +179,6 @@ def data_generator(x, y, batch_size=50): model_from_yaml(yaml_str) -@keras_test def test_nested_sequential(in_tmpdir): (x_train, y_train), (x_test, y_test) = _get_test_data() @@ -189,10 +195,13 @@ def test_nested_sequential(in_tmpdir): model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') - model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test)) - model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=2, validation_split=0.1) + model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, + validation_data=(x_test, y_test)) + model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=2, + validation_split=0.1) model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=0) - model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, shuffle=False) + model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, + shuffle=False) model.train_on_batch(x_train[:32], y_train[:32]) @@ -223,7 +232,7 @@ def test_nested_sequential(in_tmpdir): nloss = model.evaluate(x_test, y_test, verbose=0) assert(loss == nloss) - # test serialization + # Test serialization config = model.get_config() Sequential.from_config(config) @@ -235,7 +244,6 @@ def test_nested_sequential(in_tmpdir): model_from_yaml(yaml_str) -@keras_test def test_sequential_count_params(): input_dim = 20 num_units = 10 @@ -258,7 +266,6 @@ def test_sequential_count_params(): assert(n == model.count_params()) -@keras_test def test_nested_sequential_trainability(): input_dim = 20 num_units = 10 @@ -278,7 +285,6 @@ def test_nested_sequential_trainability(): assert len(model.trainable_weights) == 4 -@keras_test def test_rebuild_model(): model = Sequential() model.add(Dense(128, input_shape=(784,))) @@ -292,7 +298,6 @@ def test_rebuild_model(): # https://github.com/deep-learning-tools/keras/issues/30 @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not fully support functional_model yet.') -@keras_test def test_clone_functional_model(): val_a = np.random.random((10, 4)) val_b = np.random.random((10, 4)) @@ -339,7 +344,6 @@ def test_clone_functional_model(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not fully support clone model yet.') -@keras_test def test_clone_sequential_model(): val_a = np.random.random((10, 4)) val_out = np.random.random((10, 4)) @@ -376,7 +380,6 @@ def test_clone_sequential_model(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support update_add yet.') -@keras_test def test_sequential_update_disabling(): val_a = np.random.random((10, 4)) val_out = np.random.random((10, 4)) @@ -404,7 
+407,6 @@ def test_sequential_update_disabling(): assert np.abs(np.sum(x1 - x2)) > 1e-5 -@keras_test def test_sequential_deferred_build(): model = keras.models.Sequential() model.add(keras.layers.Dense(3)) @@ -422,14 +424,15 @@ def test_sequential_deferred_build(): assert len(model.layers) == 2 assert len(model.weights) == 4 + # Test serialization config = model.get_config() - new_model = keras.models.Sequential.from_config(config) + assert 'name' in config + new_model = Sequential.from_config(config) assert new_model.built is True assert len(new_model.layers) == 2 assert len(new_model.weights) == 4 -@keras_test def test_nested_sequential_deferred_build(): inner_model = keras.models.Sequential() inner_model.add(keras.layers.Dense(3)) diff --git a/tests/keras/utils/data_utils_test.py b/tests/keras/utils/data_utils_test.py index b292522f344..324a565c621 100644 --- a/tests/keras/utils/data_utils_test.py +++ b/tests/keras/utils/data_utils_test.py @@ -21,7 +21,11 @@ from keras.utils.data_utils import get_file from keras.utils.data_utils import validate_file from keras.utils.data_utils import prepare_sliced_sparse_data +from keras import backend as K +pytestmark = pytest.mark.skipif( + K.backend() == 'tensorflow', + reason='Temporarily disabled until the use_multiprocessing problem is solved') if sys.version_info < (3,): def next(x): return x.next() @@ -195,6 +199,17 @@ def test_generator_enqueuer_processes(): enqueuer.stop() +def test_generator_enqueuer_threadsafe(): + enqueuer = GeneratorEnqueuer(create_generator_from_sequence_pcs( + DummySequence([3, 200, 200, 3])), use_multiprocessing=False) + enqueuer.start(3, 10) + gen_output = enqueuer.get() + with pytest.raises(RuntimeError) as e: + [next(gen_output) for _ in range(10)] + assert 'thread-safe' in str(e.value) + enqueuer.stop() + + def test_generator_enqueuer_fail_threads(): enqueuer = GeneratorEnqueuer(create_generator_from_sequence_threads( FaultSequence()), use_multiprocessing=False) @@ -258,7 +273,7 @@ def test_ordered_enqueuer_fail_threads(): enqueuer = OrderedEnqueuer(FaultSequence(), use_multiprocessing=False) enqueuer.start(3, 10) gen_output = enqueuer.get() - with pytest.raises(StopIteration): + with pytest.raises(IndexError): next(gen_output) @@ -330,7 +345,7 @@ def test_ordered_enqueuer_fail_processes(): enqueuer = OrderedEnqueuer(FaultSequence(), use_multiprocessing=True) enqueuer.start(3, 10) gen_output = enqueuer.get() - with pytest.raises(StopIteration): + with pytest.raises(IndexError): next(gen_output) diff --git a/tests/keras/utils/generic_utils_test.py b/tests/keras/utils/generic_utils_test.py index 96486e1ab03..1a63d6479d1 100644 --- a/tests/keras/utils/generic_utils_test.py +++ b/tests/keras/utils/generic_utils_test.py @@ -7,12 +7,10 @@ from keras.utils.generic_utils import Progbar from keras.utils.generic_utils import func_dump from keras.utils.generic_utils import func_load -from keras.utils.test_utils import keras_test from keras import activations from keras import regularizers -@keras_test def test_progbar(): values_s = [None, [['key1', 1], ['key2', 1e-4]], diff --git a/tests/keras/utils/io_utils_test.py b/tests/keras/utils/io_utils_test.py index 4da9a547266..b0586f11b34 100644 --- a/tests/keras/utils/io_utils_test.py +++ b/tests/keras/utils/io_utils_test.py @@ -1,16 +1,17 @@ '''Tests for functions in io_utils.py. 
''' import os -import sys import pytest from keras.models import Sequential from keras.layers import Dense from keras.utils.io_utils import HDF5Matrix +from keras.utils.io_utils import h5dict from keras.utils.io_utils import ask_to_proceed_with_overwrite +from numpy.testing import assert_allclose import numpy as np import six -import warnings import h5py +import tempfile try: from unittest.mock import patch except: @@ -44,7 +45,7 @@ def test_io_utils(in_tmpdir): '''Tests the HDF5Matrix code using the sample from @jfsantos at https://gist.github.com/jfsantos/e2ef822c744357a4ed16ec0c885100a3 ''' - h5_path = 'test.h5' + _, h5_path = tempfile.mkstemp('.h5') create_dataset(h5_path) # Instantiating HDF5Matrix for the training set, @@ -133,5 +134,80 @@ def test_ask_to_proceed_with_overwrite(): assert not ask_to_proceed_with_overwrite('/tmp/not_exists') +def test_h5dict_attrs(): + _, h5_path = tempfile.mkstemp('.h5') + + # test both HDF5 and dict implementations + paths = [h5_path, dict()] + + for path in paths: + f = h5dict(path, mode='w') + + # str + f['x'] = 'abcd' + + # list + f['y'] = [b'efg', b'hij', b'klmn'] + + # ndarray + array = np.random.random((4, 5, 512)) + f['z'] = array + + f.close() + + f = h5dict(path, mode='r') + + assert f['x'] == 'abcd' + assert f['y'] == [b'efg', b'hij', b'klmn'] + assert_allclose(f['z'], array) + + f.close() + os.remove(h5_path) + + +def test_h5dict_groups(): + _, h5_path = tempfile.mkstemp('.h5') + + # test both HDF5 and dict implementations + paths = [h5_path, dict()] + + for path in paths: + f = h5dict(path, mode='w') + + group1 = f['group1'] + group2 = group1['group2'] + + group2['x'] = 'abcd' + + group3 = group2['group3'] + group3['y'] = [b'efg', b'hij', b'klmn'] + + group4 = group3['group4'] + array = np.random.random((4, 5, 512)) + group4['z'] = array + + f.close() + + f = h5dict(path, mode='r') + + assert 'group1' in f + group1 = f['group1'] + + assert 'group2' in group1 + group2 = group1['group2'] + assert group2['x'] == 'abcd' + + assert 'group3' in group2 + group3 = group2['group3'] + assert group3['y'] == [b'efg', b'hij', b'klmn'] + + assert 'group4' in group3 + group4 = group3['group4'] + assert_allclose(group4['z'], array) + + f.close() + os.remove(h5_path) + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/utils/layer_utils_test.py b/tests/keras/utils/layer_utils_test.py index 9f7d8ee53aa..60bcbddfe72 100644 --- a/tests/keras/utils/layer_utils_test.py +++ b/tests/keras/utils/layer_utils_test.py @@ -7,13 +7,11 @@ from keras.layers import Flatten from keras.models import Sequential from keras.utils import layer_utils -from keras.utils.test_utils import keras_test @pytest.mark.skipif(K.backend() == 'mxnet', reason='Test assumes kernel in channels_last format always. 
MXNet backend ' 'has performance optimization that changes kernel from ' '"channels_first" to "channels_last" based on image_data_format') -@keras_test def test_convert_weights(): def get_model(shape, data_format): model = Sequential() diff --git a/tests/keras/utils/multi_gpu_test.py b/tests/keras/utils/multi_gpu_test.py index 1f8f59737d7..bfb3d5a5570 100644 --- a/tests/keras/utils/multi_gpu_test.py +++ b/tests/keras/utils/multi_gpu_test.py @@ -11,7 +11,6 @@ import time import tempfile import tensorflow as tf -from keras.utils.test_utils import keras_test from keras.preprocessing.image import ImageDataGenerator @@ -25,7 +24,6 @@ reason='Requires 8 GPUs.') -@keras_test def test_multi_gpu_simple_model(): print('####### test simple model') num_samples = 1000 @@ -52,7 +50,6 @@ def test_multi_gpu_simple_model(): parallel_model.fit(x, y, epochs=epochs) -@keras_test def test_multi_gpu_multi_io_model(): print('####### test multi-io model') num_samples = 1000 @@ -88,7 +85,6 @@ def test_multi_gpu_multi_io_model(): parallel_model.fit([a_x, b_x], [a_y, b_y], epochs=epochs) -@keras_test def test_multi_gpu_invalid_devices(): input_shape = (1000, 10) model = keras.models.Sequential() @@ -120,7 +116,6 @@ def test_multi_gpu_invalid_devices(): parallel_model.fit(x, y, epochs=2) -@keras_test def test_serialization(): model = keras.models.Sequential() model.add(keras.layers.Dense(3, @@ -272,7 +267,6 @@ def multi_gpu_application_folder_generator_benchmark(): print('%d gpus training:' % i, total_time) -@keras_test def test_multi_gpu_with_multi_input_layers(): inputs = keras.Input((4, 3)) init_state = keras.Input((3,)) @@ -286,5 +280,25 @@ def test_multi_gpu_with_multi_input_layers(): parallel_model.train_on_batch(x, y) +def test_multi_gpu_with_siamese(): + input_shape = (3,) + nested_model = keras.models.Sequential([ + keras.layers.Dense(32, input_shape=input_shape), + keras.layers.Dense(1) + ], name='nested') + + input1 = keras.Input(input_shape) + input2 = keras.Input(input_shape) + score1 = nested_model(input1) + score2 = nested_model(input2) + score_sum = keras.layers.Add(name='add')([score1, score2]) + + siamese = keras.models.Model(inputs=[input1, input2], + outputs=[score_sum, score1, score2], + name='siamese') + parallel_siamese = multi_gpu_model(siamese, 2) + assert parallel_siamese.output_names == ['add', 'nested_1', 'nested_2'] + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/test_documentation.py b/tests/test_documentation.py index e5d02e109e9..13b6d22c0ad 100644 --- a/tests/test_documentation.py +++ b/tests/test_documentation.py @@ -49,8 +49,9 @@ def assert_doc_style(name, member, doc): lines = doc.split("\n") first_line = lines[0] if len(first_line.strip()) == 0: - raise ValueError("{} the documentation should be on the first line.".format(name), - member.__module__) + raise ValueError( + "{} the documentation should be on the first line.".format(name), + member.__module__) if first_line.strip()[-1] != '.': raise ValueError("{} first line should end with a '.'".format(name), member.__module__) @@ -118,8 +119,10 @@ def assert_args_presence(args, doc, member, name): styles = [arg + ":" not in words for arg in args] if any(styles): raise ValueError( - "{} {} are not style properly 'argument': documentation".format(name, list( - compress(args, styles))), member.__module__) + "{} {} are not style properly 'argument': documentation".format( + name, + list(compress(args, styles))), + member.__module__) # Check arguments order indexes = [words.index(arg + ":") for arg in args] 
diff --git a/tests/test_dynamic_trainability.py b/tests/test_dynamic_trainability.py index 7aa5ead01ac..2d2a9b47c0f 100644 --- a/tests/test_dynamic_trainability.py +++ b/tests/test_dynamic_trainability.py @@ -2,12 +2,10 @@ from __future__ import print_function import pytest -from keras.utils.test_utils import keras_test from keras.models import Model, Sequential from keras.layers import Dense, Input -@keras_test def test_layer_trainability_switch(): # with constructor argument, in Sequential model = Sequential() @@ -38,7 +36,6 @@ def test_layer_trainability_switch(): assert model.trainable_weights == [] -@keras_test def test_model_trainability_switch(): # a non-trainable model has no trainable weights x = Input(shape=(1,)) @@ -54,7 +51,6 @@ def test_model_trainability_switch(): assert model.trainable_weights == [] -@keras_test def test_nested_model_trainability(): # a Sequential inside a Model inner_model = Sequential() diff --git a/tests/test_loss_masking.py b/tests/test_loss_masking.py index 1c54724d842..45b23b1fb7d 100644 --- a/tests/test_loss_masking.py +++ b/tests/test_loss_masking.py @@ -4,12 +4,10 @@ from keras.models import Sequential from keras.engine.training_utils import weighted_masked_objective from keras.layers import TimeDistributed, Masking, Dense -from keras.utils.test_utils import keras_test from keras import losses from keras import backend as K -@keras_test def test_masking(): np.random.seed(1337) x = np.array([[[1], [1]], @@ -24,7 +22,6 @@ def test_masking(): assert loss == 0 -@keras_test def test_loss_masking(): weighted_loss = weighted_masked_objective(losses.get('mae')) shape = (3, 4, 2) diff --git a/tests/test_loss_weighting.py b/tests/test_loss_weighting.py index bf6cbbf8f14..d65aac80015 100644 --- a/tests/test_loss_weighting.py +++ b/tests/test_loss_weighting.py @@ -8,7 +8,6 @@ from keras.models import Sequential, Model from keras.layers import Dense, Activation, GRU, TimeDistributed, Input from keras.utils import np_utils -from keras.utils.test_utils import keras_test from numpy.testing import assert_almost_equal, assert_array_almost_equal num_classes = 10 @@ -53,7 +52,8 @@ def _get_test_data(): sample_weight = np.ones((y_train.shape[0])) * standard_weight sample_weight[int_y_train == weighted_class] = high_weight - return (x_train, y_train), (x_test, y_test), (sample_weight, class_weight, test_ids) + return ((x_train, y_train), (x_test, y_test), + (sample_weight, class_weight, test_ids)) def create_sequential_model(): @@ -73,12 +73,12 @@ def create_temporal_sequential_model(): return model -@keras_test def test_sequential_class_weights(): model = create_sequential_model() model.compile(loss=loss, optimizer='rmsprop') - (x_train, y_train), (x_test, y_test), (sample_weight, class_weight, test_ids) = _get_test_data() + ((x_train, y_train), (x_test, y_test), + (sample_weight, class_weight, test_ids)) = _get_test_data() model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs // 3, verbose=0, @@ -98,12 +98,12 @@ def test_sequential_class_weights(): assert(score < standard_score_sequential) -@keras_test def test_sequential_sample_weights(): model = create_sequential_model() model.compile(loss=loss, optimizer='rmsprop') - (x_train, y_train), (x_test, y_test), (sample_weight, class_weight, test_ids) = _get_test_data() + ((x_train, y_train), (x_test, y_test), + (sample_weight, class_weight, test_ids)) = _get_test_data() model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs // 3, verbose=0, @@ -121,9 +121,9 @@ def test_sequential_sample_weights(): 
assert(score < standard_score_sequential) -@keras_test def test_sequential_temporal_sample_weights(): - (x_train, y_train), (x_test, y_test), (sample_weight, class_weight, test_ids) = _get_test_data() + ((x_train, y_train), (x_test, y_test), + (sample_weight, class_weight, test_ids)) = _get_test_data() temporal_x_train = np.reshape(x_train, (len(x_train), 1, x_train.shape[1])) temporal_x_train = np.repeat(temporal_x_train, timesteps, axis=1) @@ -154,25 +154,28 @@ def test_sequential_temporal_sample_weights(): sample_weight=temporal_sample_weight[:32]) model.test_on_batch(temporal_x_train[:32], temporal_y_train[:32], sample_weight=temporal_sample_weight[:32]) - score = model.evaluate(temporal_x_test[test_ids], temporal_y_test[test_ids], verbose=0) + score = model.evaluate(temporal_x_test[test_ids], temporal_y_test[test_ids], + verbose=0) assert(score < standard_score_sequential) -@keras_test def test_weighted_metrics_with_sample_weight(): decimal = decimal_precision[K.backend()] model = create_sequential_model() - model.compile(loss=loss, optimizer='rmsprop', metrics=[loss], weighted_metrics=[loss]) + model.compile(loss=loss, optimizer='rmsprop', + metrics=[loss], weighted_metrics=[loss]) - (x_train, y_train), (x_test, y_test), (sample_weight, class_weight, test_ids) = _get_test_data() + ((x_train, y_train), (x_test, y_test), + (sample_weight, class_weight, test_ids)) = _get_test_data() history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs // 3, verbose=0, sample_weight=sample_weight) h = history.history - assert_array_almost_equal(h['loss'], h['weighted_' + loss_full_name], decimal=decimal) + assert_array_almost_equal(h['loss'], h['weighted_' + loss_full_name], + decimal=decimal) history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs // 3, verbose=0, @@ -180,7 +183,8 @@ def test_weighted_metrics_with_sample_weight(): validation_split=0.1) h = history.history - assert_almost_equal(h['val_loss'], h['val_weighted_' + loss_full_name], decimal=decimal) + assert_almost_equal(h['val_loss'], h['val_weighted_' + loss_full_name], + decimal=decimal) model.train_on_batch(x_train[:32], y_train[:32], sample_weight=sample_weight[:32]) @@ -190,7 +194,8 @@ def test_weighted_metrics_with_sample_weight(): test_sample_weight = np.ones((y_test.shape[0])) * standard_weight test_sample_weight[test_ids] = high_weight - scores = model.evaluate(x_test, y_test, verbose=0, sample_weight=test_sample_weight) + scores = model.evaluate(x_test, y_test, verbose=0, + sample_weight=test_sample_weight) loss_score, metric_score, weighted_metric_score = scores assert loss_score < standard_score_sequential @@ -198,12 +203,12 @@ def test_weighted_metrics_with_sample_weight(): assert_almost_equal(loss_score, weighted_metric_score, decimal=decimal) -@keras_test def test_weighted_metrics_with_no_sample_weight(): decimal = decimal_precision[K.backend()] model = create_sequential_model() - model.compile(loss=loss, optimizer='rmsprop', metrics=[loss], weighted_metrics=[loss]) + model.compile(loss=loss, optimizer='rmsprop', + metrics=[loss], weighted_metrics=[loss]) (x_train, y_train), (x_test, y_test), _ = _get_test_data() @@ -212,14 +217,17 @@ def test_weighted_metrics_with_no_sample_weight(): h = history.history assert_array_almost_equal(h['loss'], h[loss_full_name], decimal=decimal) - assert_array_almost_equal(h['loss'], h['weighted_' + loss_full_name], decimal=decimal) + assert_array_almost_equal(h['loss'], h['weighted_' + loss_full_name], + decimal=decimal) history = model.fit(x_train, 
y_train, batch_size=batch_size, epochs=epochs // 3, verbose=0, validation_split=0.1) h = history.history - assert_array_almost_equal(h['val_loss'], h['val_' + loss_full_name], decimal=decimal) - assert_array_almost_equal(h['val_loss'], h['val_weighted_' + loss_full_name], decimal=decimal) + assert_array_almost_equal(h['val_loss'], h['val_' + loss_full_name], + decimal=decimal) + assert_array_almost_equal(h['val_loss'], h['val_weighted_' + loss_full_name], + decimal=decimal) model.train_on_batch(x_train[:32], y_train[:32]) model.test_on_batch(x_train[:32], y_train[:32]) @@ -231,10 +239,10 @@ def test_weighted_metrics_with_no_sample_weight(): assert_almost_equal(loss_score, weighted_metric_score, decimal=decimal) -@keras_test def test_weighted_metrics_with_weighted_accuracy_metric(): model = create_sequential_model() - model.compile(loss=loss, optimizer='rmsprop', metrics=['acc'], weighted_metrics=['acc']) + model.compile(loss=loss, optimizer='rmsprop', + metrics=['acc'], weighted_metrics=['acc']) (x_train, y_train), _, (sample_weight, _, _) = _get_test_data() @@ -245,7 +253,6 @@ def test_weighted_metrics_with_weighted_accuracy_metric(): assert history.history['acc'] != history.history['weighted_acc'] -@keras_test def test_weighted_metrics_with_multiple_outputs(): decimal = decimal_precision[K.backend()] @@ -260,7 +267,8 @@ def test_weighted_metrics_with_multiple_outputs(): weighted_metrics = {'output2': [loss]} loss_map = {'output1': loss, 'output2': loss} - model.compile(loss=loss_map, optimizer='sgd', metrics=metrics, weighted_metrics=weighted_metrics) + model.compile(loss=loss_map, optimizer='sgd', + metrics=metrics, weighted_metrics=weighted_metrics) x = np.array([[1, 1, 1, 1, 1]]) y = {'output1': np.array([0]), 'output2': np.array([1])} @@ -274,12 +282,12 @@ def test_weighted_metrics_with_multiple_outputs(): assert_almost_equal(unweighted_metric * weight, weighted_metric, decimal=decimal) -@keras_test def test_class_weight_wrong_classes(): model = create_sequential_model() model.compile(loss=loss, optimizer='rmsprop') - (x_train, y_train), (x_test, y_test), (sample_weight, class_weight, test_ids) = _get_test_data() + ((x_train, y_train), (x_test, y_test), + (sample_weight, class_weight, test_ids)) = _get_test_data() del class_weight[1] with pytest.raises(ValueError): diff --git a/tests/test_model_pickling.py b/tests/test_model_pickling.py new file mode 100644 index 00000000000..47dd702ff54 --- /dev/null +++ b/tests/test_model_pickling.py @@ -0,0 +1,170 @@ +import pytest +import os +import sys +import tempfile +import numpy as np +from numpy.testing import assert_allclose +from numpy.testing import assert_raises + +from keras import backend as K +from keras.models import Model, Sequential +from keras.layers import Dense, Lambda, RepeatVector, TimeDistributed +from keras.layers import Input +from keras import optimizers +from keras import losses +from keras import metrics + +if sys.version_info[0] == 3: + import pickle +else: + import cPickle as pickle + + +skipif_no_tf_gpu = pytest.mark.skipif( + (K.backend() != 'tensorflow') or + (not K.tensorflow_backend._get_available_gpus()), + reason='Requires TensorFlow backend and a GPU') +skipif_mxnet = pytest.mark.skipif(K.backend() == 'mxnet', + reason='MXNet Backend: does not support pickle yet') + + +@skipif_mxnet +def test_sequential_model_pickling(): + model = Sequential() + model.add(Dense(2, input_shape=(3,))) + model.add(RepeatVector(3)) + model.add(TimeDistributed(Dense(3))) + model.compile(loss=losses.MSE, + 
optimizer=optimizers.RMSprop(lr=0.0001), + metrics=[metrics.categorical_accuracy], + sample_weight_mode='temporal') + x = np.random.random((1, 3)) + y = np.random.random((1, 3, 3)) + model.train_on_batch(x, y) + + out = model.predict(x) + + state = pickle.dumps(model) + + new_model = pickle.loads(state) + + out2 = new_model.predict(x) + assert_allclose(out, out2, atol=1e-05) + + # test that new updates are the same with both models + x = np.random.random((1, 3)) + y = np.random.random((1, 3, 3)) + model.train_on_batch(x, y) + new_model.train_on_batch(x, y) + out = model.predict(x) + out2 = new_model.predict(x) + assert_allclose(out, out2, atol=1e-05) + + +@skipif_mxnet +def test_sequential_model_pickling_2(): + # test with custom optimizer, loss + custom_opt = optimizers.rmsprop + custom_loss = losses.mse + model = Sequential() + model.add(Dense(2, input_shape=(3,))) + model.add(Dense(3)) + model.compile(loss=custom_loss, optimizer=custom_opt(), metrics=['acc']) + + x = np.random.random((1, 3)) + y = np.random.random((1, 3)) + model.train_on_batch(x, y) + + out = model.predict(x) + + state = pickle.dumps(model) + model = pickle.loads(state) + + out2 = model.predict(x) + assert_allclose(out, out2, atol=1e-05) + + +@skipif_mxnet +def test_functional_model_pickling(): + inputs = Input(shape=(3,)) + x = Dense(2)(inputs) + outputs = Dense(3)(x) + + model = Model(inputs, outputs) + model.compile(loss=losses.MSE, + optimizer=optimizers.Adam(), + metrics=[metrics.categorical_accuracy]) + x = np.random.random((1, 3)) + y = np.random.random((1, 3)) + model.train_on_batch(x, y) + + out = model.predict(x) + state = pickle.dumps(model) + + model = pickle.loads(state) + + out2 = model.predict(x) + assert_allclose(out, out2, atol=1e-05) + + +@skipif_mxnet +def test_pickling_multiple_metrics_outputs(): + inputs = Input(shape=(5,)) + x = Dense(5)(inputs) + output1 = Dense(1, name='output1')(x) + output2 = Dense(1, name='output2')(x) + + model = Model(inputs=inputs, outputs=[output1, output2]) + + metrics = {'output1': ['mse', 'binary_accuracy'], + 'output2': ['mse', 'binary_accuracy'] + } + loss = {'output1': 'mse', 'output2': 'mse'} + + model.compile(loss=loss, optimizer='sgd', metrics=metrics) + + # assure that model is working + x = np.array([[1, 1, 1, 1, 1]]) + out = model.predict(x) + + model = pickle.loads(pickle.dumps(model)) + + out2 = model.predict(x) + assert_allclose(out, out2, atol=1e-05) + + +@skipif_mxnet +def test_pickling_without_compilation(): + """Test pickling model without compiling. 
+ """ + model = Sequential() + model.add(Dense(2, input_shape=(3,))) + model.add(Dense(3)) + + model = pickle.loads(pickle.dumps(model)) + + +@skipif_mxnet +def test_pickling_right_after_compilation(): + model = Sequential() + model.add(Dense(2, input_shape=(3,))) + model.add(Dense(3)) + model.compile(loss='mse', optimizer='sgd', metrics=['acc']) + model._make_train_function() + + model = pickle.loads(pickle.dumps(model)) + + +@skipif_mxnet +def test_pickling_unused_layers_is_ok(): + a = Input(shape=(256, 512, 6)) + b = Input(shape=(256, 512, 1)) + c = Lambda(lambda x: x[:, :, :, :1])(a) + + model = Model(inputs=[a, b], outputs=c) + + model = pickle.loads(pickle.dumps(model)) + + +if __name__ == '__main__': + pytest.main([__file__]) diff --git a/tests/test_model_saving.py b/tests/test_model_saving.py index 856c1fbf6c2..f60ab2b200b 100644 --- a/tests/test_model_saving.py +++ b/tests/test_model_saving.py @@ -9,25 +9,25 @@ from keras import backend as K from keras.engine.saving import preprocess_weights_for_loading from keras.models import Model, Sequential -from keras.layers import Dense, Lambda, RepeatVector, TimeDistributed, Bidirectional, GRU, LSTM, CuDNNGRU, CuDNNLSTM +from keras.layers import Dense, Lambda, RepeatVector, TimeDistributed from keras.layers import Embedding +from keras.layers import Bidirectional, GRU, LSTM, CuDNNGRU, CuDNNLSTM from keras.layers import Conv2D, Flatten from keras.layers import Input, InputLayer from keras.initializers import Constant from keras import optimizers from keras import losses from keras import metrics -from keras.utils.test_utils import keras_test from keras.models import save_model, load_model, save_mxnet_model from keras import datasets skipif_no_tf_gpu = pytest.mark.skipif( - (K.backend() != 'tensorflow') or (not K.tensorflow_backend._get_available_gpus()), + (K.backend() != 'tensorflow' or + not K.tensorflow_backend._get_available_gpus()), reason='Requires TensorFlow backend and a GPU') -@keras_test def test_sequential_model_saving(): model = Sequential() model.add(Dense(2, input_shape=(3,))) @@ -62,7 +62,6 @@ def test_sequential_model_saving(): assert_allclose(out, out2, atol=1e-02) -@keras_test def test_sequential_model_saving_2(): # test with custom optimizer, loss custom_opt = optimizers.rmsprop @@ -89,7 +88,6 @@ def test_sequential_model_saving_2(): assert_allclose(out, out2, atol=1e-05) -@keras_test def test_functional_model_saving(): inputs = Input(shape=(3,)) x = Dense(2)(inputs) @@ -114,7 +112,6 @@ def test_functional_model_saving(): assert_allclose(out, out2, atol=1e-05) -@keras_test def test_model_saving_to_pre_created_h5py_file(): inputs = Input(shape=(3,)) x = Dense(2)(inputs) @@ -144,8 +141,14 @@ def test_model_saving_to_pre_created_h5py_file(): out2 = loaded_model.predict(x) assert_allclose(out, out2, atol=1e-05) + with h5py.File(fname, mode='r+') as h5file: + g = h5file.create_group('model') + save_model(model, g) + loaded_model = load_model(g) + out2 = loaded_model.predict(x) + assert_allclose(out, out2, atol=1e-05) + -@keras_test def test_model_saving_to_binary_stream(): inputs = Input(shape=(3,)) x = Dense(2)(inputs) @@ -187,7 +190,6 @@ def test_model_saving_to_binary_stream(): assert_allclose(out, out2, atol=1e-05) -@keras_test def test_saving_multiple_metrics_outputs(): inputs = Input(shape=(5,)) x = Dense(5)(inputs) @@ -216,7 +218,6 @@ def test_saving_multiple_metrics_outputs(): assert_allclose(out, out2, atol=1e-05) -@keras_test def test_saving_without_compilation(): """Test saving model without compiling. 
""" @@ -230,7 +231,6 @@ def test_saving_without_compilation(): os.remove(fname) -@keras_test def test_saving_right_after_compilation(): model = Sequential() model.add(Dense(2, input_shape=(3,))) @@ -246,7 +246,6 @@ def test_saving_right_after_compilation(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not allow saving unused layers yet.') -@keras_test def test_saving_unused_layers_is_ok(): a = Input(shape=(256, 512, 6)) b = Input(shape=(256, 512, 1)) @@ -262,7 +261,6 @@ def test_saving_unused_layers_is_ok(): @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support loading weights and reshape yet.') -@keras_test def test_loading_weights_by_name_and_reshape(): """ test loading model weights by name on: @@ -344,7 +342,6 @@ def test_loading_weights_by_name_and_reshape(): os.remove(fname) -@keras_test def test_loading_weights_by_name_2(): """ test loading model weights by name on: @@ -403,7 +400,6 @@ def test_loading_weights_by_name_2(): assert_allclose(np.zeros_like(jessica[1]), jessica[1]) # biases init to 0 -@keras_test def test_loading_weights_by_name_skip_mismatch(): """ test skipping layers while loading model weights by name on: @@ -456,7 +452,6 @@ def square_fn(x): return x * x -@keras_test def test_saving_lambda_custom_objects(): inputs = Input(shape=(3,)) x = Lambda(lambda x: square_fn(x), output_shape=(3,))(inputs) @@ -484,7 +479,6 @@ def test_saving_lambda_custom_objects(): # https://github.com/deep-learning-tools/keras/issues/26 @pytest.mark.skipif(K.backend() == 'mxnet', reason='MXNet backend does not support Keras Variable and NDArray operation yet.') -@keras_test def test_saving_lambda_numpy_array_arguments(): mean = np.random.random((4, 2, 3)) std = np.abs(np.random.random((4, 2, 3))) + 1e-5 @@ -504,7 +498,6 @@ def test_saving_lambda_numpy_array_arguments(): assert_allclose(std, model.layers[1].arguments['std']) -@keras_test def test_saving_custom_activation_function(): x = Input(shape=(3,)) output = Dense(3, activation=K.cos)(x) @@ -528,7 +521,6 @@ def test_saving_custom_activation_function(): assert_allclose(out, out2, atol=1e-05) -@keras_test def test_saving_model_with_long_layer_names(): # This layer name will make the `layers_name` HDF5 attribute blow # out of proportion. Note that it fits into the internal HDF5 @@ -571,7 +563,6 @@ def test_saving_model_with_long_layer_names(): assert_allclose(out, out2, atol=1e-05) -@keras_test def test_saving_model_with_long_weights_names(): x = Input(shape=(2,), name='nested_model_input') f = x @@ -604,8 +595,9 @@ def test_saving_model_with_long_weights_names(): # Check that the HDF5 files contains chunked array # of weight names. 
with h5py.File(fname, 'r') as h5file: - n_weight_names_arrays = len([attr for attr in h5file['model_weights']['nested_model'].attrs - if attr.startswith('weight_names')]) + attrs = [attr for attr in h5file['model_weights']['nested_model'].attrs + if attr.startswith('weight_names')] + n_weight_names_arrays = len(attrs) os.remove(fname) @@ -616,7 +608,6 @@ def test_saving_model_with_long_weights_names(): assert_allclose(out, out2, atol=1e-05) -@keras_test def test_saving_recurrent_layer_with_init_state(): vector_size = 8 input_length = 20 @@ -636,7 +627,6 @@ def test_saving_recurrent_layer_with_init_state(): os.remove(fname) -@keras_test def test_saving_recurrent_layer_without_bias(): vector_size = 8 input_length = 20 @@ -652,12 +642,35 @@ def test_saving_recurrent_layer_without_bias(): os.remove(fname) -@keras_test +def test_loop_model_saving(): + model = Sequential() + model.add(Dense(2, input_shape=(3,))) + model.compile(loss=losses.MSE, + optimizer=optimizers.RMSprop(lr=0.0001), + metrics=[metrics.categorical_accuracy]) + + x = np.random.random((1, 3)) + y = np.random.random((1, 2)) + _, fname = tempfile.mkstemp('.h5') + + for _ in range(3): + model.train_on_batch(x, y) + save_model(model, fname, overwrite=True) + out = model.predict(x) + + new_model = load_model(fname) + os.remove(fname) + + out2 = new_model.predict(x) + assert_allclose(out, out2, atol=1e-05) + + def test_saving_constant_initializer_with_numpy(): - """Test saving and loading model of constant initializer with numpy ndarray as input. + """Test saving and loading model of constant initializer with numpy inputs. """ model = Sequential() - model.add(Dense(2, input_shape=(3,), kernel_initializer=Constant(np.ones((3, 2))))) + model.add(Dense(2, input_shape=(3,), + kernel_initializer=Constant(np.ones((3, 2))))) model.add(Dense(3)) model.compile(loss='mse', optimizer='sgd', metrics=['acc']) @@ -667,16 +680,22 @@ def test_saving_constant_initializer_with_numpy(): os.remove(fname) -@keras_test @pytest.mark.parametrize('implementation', [1, 2], ids=['impl1', 'impl2']) -@pytest.mark.parametrize('bidirectional', [False, True], ids=['single', 'bidirectional']) +@pytest.mark.parametrize('bidirectional', + [False, True], + ids=['single', 'bidirectional']) @pytest.mark.parametrize('to_cudnn', [False, True], ids=['from_cudnn', 'to_cudnn']) @pytest.mark.parametrize('rnn_type', ['LSTM', 'GRU'], ids=['LSTM', 'GRU']) -@pytest.mark.parametrize('model_nest_level', [1, 2], ids=['model_plain', 'model_nested']) -@pytest.mark.parametrize('model_type', ['func', 'seq'], ids=['model_func', 'model_seq']) +@pytest.mark.parametrize('model_nest_level', + [1, 2], + ids=['model_plain', 'model_nested']) +@pytest.mark.parametrize('model_type', + ['func', 'seq'], + ids=['model_func', 'model_seq']) @skipif_no_tf_gpu -def test_load_weights_between_noncudnn_rnn(rnn_type, to_cudnn, bidirectional, implementation, - model_nest_level, model_type): +def test_load_weights_between_noncudnn_rnn(rnn_type, to_cudnn, bidirectional, + implementation, model_nest_level, + model_type): input_size = 10 timesteps = 6 input_shape = (timesteps, input_size) @@ -707,7 +726,8 @@ def test_load_weights_between_noncudnn_rnn(rnn_type, to_cudnn, bidirectional, im cudnn_layer = Bidirectional(cudnn_layer) model = _make_nested_model(input_shape, layer, model_nest_level, model_type) - cudnn_model = _make_nested_model(input_shape, cudnn_layer, model_nest_level, model_type) + cudnn_model = _make_nested_model(input_shape, cudnn_layer, + model_nest_level, model_type) if to_cudnn: 
_convert_model_weights(model, cudnn_model) @@ -747,7 +767,6 @@ def _convert_model_weights(source_model, target_model): os.remove(fname) -@keras_test @pytest.mark.parametrize('to_cudnn', [False, True], ids=['from_cudnn', 'to_cudnn']) @pytest.mark.parametrize('rnn_type', ['LSTM', 'GRU'], ids=['LSTM', 'GRU']) @skipif_no_tf_gpu @@ -810,7 +829,8 @@ def initialize_weights(layer): def assert_not_compatible(src, dest, message): with pytest.raises(ValueError) as ex: - preprocess_weights_for_loading(dest, initialize_weights(src).get_weights()) + preprocess_weights_for_loading(dest, + initialize_weights(src).get_weights()) assert message in ex.value.message assert_not_compatible(gru(), gru(cudnn=True), @@ -818,14 +838,15 @@ def assert_not_compatible(src, dest, message): assert_not_compatible(gru(cudnn=True), gru(), 'CuDNNGRU is not compatible with GRU(reset_after=False)') assert_not_compatible(gru(), gru(reset_after=True), - 'GRU(reset_after=False) is not compatible with GRU(reset_after=True)') + 'GRU(reset_after=False) is not compatible with ' + 'GRU(reset_after=True)') assert_not_compatible(gru(reset_after=True), gru(), - 'GRU(reset_after=True) is not compatible with GRU(reset_after=False)') + 'GRU(reset_after=True) is not compatible with ' + 'GRU(reset_after=False)') @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_sequential_lstm_mxnet_model_saving(): max_features = 1000 maxlen = 80 @@ -863,7 +884,6 @@ def test_sequential_lstm_mxnet_model_saving(): @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_sequential_mxnet_model_saving(): model = Sequential() model.add(Dense(2, input_shape=(3,))) @@ -894,7 +914,6 @@ def test_sequential_mxnet_model_saving(): @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_sequential_mxnet_model_saving_no_compile(): model = Sequential() model.add(Dense(2, input_shape=(3,))) @@ -906,7 +925,6 @@ def test_sequential_mxnet_model_saving_no_compile(): @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_sequential_get_mxnet_model_info(): model = Sequential() model.add(Dense(2, input_shape=(3,))) @@ -949,7 +967,6 @@ def test_sequential_get_mxnet_model_info(): @pytest.mark.skipif((K.backend() != 'mxnet'), reason='Supported for MXNet backend only.') -@keras_test def test_sequential_get_mxnet_model_info_no_compile(): model = Sequential() model.add(Dense(2, input_shape=(3,))) @@ -959,7 +976,6 @@ def test_sequential_get_mxnet_model_info_no_compile(): K.get_mxnet_model_info(model) -@keras_test def test_functional_model_get_mxnet_model_info(): inputs = Input(shape=(3,)) x = Dense(2)(inputs) @@ -1003,7 +1019,6 @@ def _get_kernel_and_bias(model): return w_list, b_list -@keras_test def test_fine_tune_save_weights(): def get_model(): # build a regression model diff --git a/tests/test_multiprocessing.py b/tests/test_multiprocessing.py index ab637e1c82b..f22cc40ee3b 100644 --- a/tests/test_multiprocessing.py +++ b/tests/test_multiprocessing.py @@ -5,12 +5,16 @@ import numpy as np from keras.models import Sequential from keras.layers.core import Dense -from keras.utils.test_utils import keras_test from keras.utils import Sequence +from keras import backend as K + +pytestmark = pytest.mark.skipif( + K.backend() == 'tensorflow', + reason='Temporarily disabled until the use_multiprocessing problem is solved') STEPS_PER_EPOCH = 100 STEPS = 100 -WORKERS = 
4 +WORKERS = 4 if K.backend() != 'tensorflow' else 2 class DummySequence(Sequence): @@ -21,6 +25,36 @@ def __len__(self): return 10 +class threadsafe_iter: + """Takes an iterator/generator and makes it thread-safe by + serializing call to the `next` method of given iterator/generator. + """ + + def __init__(self, it): + self.it = it + self.lock = threading.Lock() + + def __iter__(self): + return self + + def __next__(self): + return self.next() + + def next(self): + with self.lock: + return next(self.it) + + +def threadsafe_generator(f): + """A decorator that takes a generator function and makes it thread-safe. + """ + + def g(*a, **kw): + return threadsafe_iter(f(*a, **kw)) + + return g + + @pytest.fixture def in_tmpdir(tmpdir): """Runs a function in a temporary directory. @@ -32,12 +66,12 @@ def in_tmpdir(tmpdir): assert not tmpdir.listdir() -@keras_test def test_multiprocessing_training(): arr_data = np.random.randint(0, 256, (50, 2)) arr_labels = np.random.randint(0, 2, 50) arr_weights = np.random.random(50) + @threadsafe_generator def custom_generator(use_weights=False): batch_size = 10 n_samples = 50 @@ -201,6 +235,7 @@ def custom_generator(use_weights=False): use_multiprocessing=False) # Test invalid use cases + @threadsafe_generator def invalid_generator(): while True: yield arr_data[:10], arr_data[:10], arr_labels[:10], arr_labels[:10] @@ -239,12 +274,12 @@ def invalid_generator(): use_multiprocessing=False) -@keras_test def test_multiprocessing_training_from_file(in_tmpdir): arr_data = np.random.randint(0, 256, (50, 2)) arr_labels = np.random.randint(0, 2, 50) np.savez('data.npz', **{'data': arr_data, 'labels': arr_labels}) + @threadsafe_generator def custom_generator(): batch_size = 10 @@ -359,10 +394,10 @@ def custom_generator(): os.remove('data.npz') -@keras_test def test_multiprocessing_predicting(): arr_data = np.random.randint(0, 256, (50, 2)) + @threadsafe_generator def custom_generator(): batch_size = 10 n_samples = 50 @@ -447,11 +482,11 @@ def custom_generator(): use_multiprocessing=False) -@keras_test def test_multiprocessing_evaluating(): arr_data = np.random.randint(0, 256, (50, 2)) arr_labels = np.random.randint(0, 2, 50) + @threadsafe_generator def custom_generator(): batch_size = 10 n_samples = 50 @@ -538,7 +573,6 @@ def custom_generator(): use_multiprocessing=False) -@keras_test def test_multiprocessing_fit_error(): arr_data = np.random.randint(0, 256, (50, 2)) arr_labels = np.random.randint(0, 2, 50) @@ -546,6 +580,7 @@ def test_multiprocessing_fit_error(): n_samples = 50 good_batches = 3 + @threadsafe_generator def custom_generator(use_weights=False): """Raises an exception after a few good batches""" for i in range(good_batches): @@ -570,7 +605,7 @@ def custom_generator(use_weights=False): # exception and does not attempt to run the generator. # - On other platforms, make sure `RuntimeError` exception bubbles up if os.name is 'nt': - with pytest.raises(ValueError): + with pytest.raises(RuntimeError): model.fit_generator(custom_generator(), steps_per_epoch=samples, validation_steps=None, @@ -604,7 +639,7 @@ def custom_generator(use_weights=False): # exception and does not attempt to run the generator. 
# - On other platforms, make sure `RuntimeError` exception bubbles up if os.name is 'nt': - with pytest.raises(ValueError): + with pytest.raises(RuntimeError): model.fit_generator(custom_generator(), steps_per_epoch=samples, validation_steps=None, @@ -650,7 +685,6 @@ def custom_generator(use_weights=False): use_multiprocessing=False) -@keras_test def test_multiprocessing_evaluate_error(): arr_data = np.random.randint(0, 256, (50, 2)) arr_labels = np.random.randint(0, 2, 50) @@ -658,6 +692,7 @@ def test_multiprocessing_evaluate_error(): n_samples = 50 good_batches = 3 + @threadsafe_generator def custom_generator(): """Raises an exception after a few good batches""" for i in range(good_batches): @@ -711,7 +746,7 @@ def custom_generator(): # exception and does not attempt to run the generator. # - On other platforms, make sure `RuntimeError` exception bubbles up if os.name is 'nt': - with pytest.raises(ValueError): + with pytest.raises(RuntimeError): model.evaluate_generator(custom_generator(), steps=good_batches + 1, max_queue_size=10, @@ -752,11 +787,11 @@ def custom_generator(): use_multiprocessing=False) -@keras_test def test_multiprocessing_predict_error(): arr_data = np.random.randint(0, 256, (50, 2)) good_batches = 3 + @threadsafe_generator def custom_generator(): """Raises an exception after a few good batches""" batch_size = 10 @@ -781,7 +816,7 @@ def custom_generator(): # exception and does not attempt to run the generator. # - On other platforms, make sure `RuntimeError` exception bubbles up if os.name is 'nt': - with pytest.raises(ValueError): + with pytest.raises(StopIteration): model.predict_generator(custom_generator(), steps=good_batches * WORKERS + 1, max_queue_size=10, @@ -812,7 +847,7 @@ def custom_generator(): # exception and does not attempt to run the generator. # - On other platforms, make sure `RuntimeError` exception bubbles up if os.name is 'nt': - with pytest.raises(ValueError): + with pytest.raises(RuntimeError): model.predict_generator(custom_generator(), steps=good_batches + 1, max_queue_size=10, @@ -852,5 +887,6 @@ def custom_generator(): workers=0, use_multiprocessing=False) + if __name__ == '__main__': pytest.main([__file__])
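For readers of this patch, a minimal standalone sketch (not part of the diff above) of how the thread-safe generator wrapper added in tests/test_multiprocessing.py is meant to be used: several worker threads pull from one wrapped generator, and the lock in threadsafe_iter hands each item to exactly one consumer. The helper class and decorator below mirror the test-local helpers in the patch; counting_batches, consume, the four worker threads, and the batch count of 100 are illustrative stand-ins for a Keras data generator and fit_generator workers, not part of any Keras API.

import threading


class threadsafe_iter(object):
    """Wraps an iterator/generator and serializes calls to its `next` method."""

    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        with self.lock:
            return next(self.it)


def threadsafe_generator(f):
    """Decorator that makes a generator function return thread-safe iterators."""
    def g(*args, **kwargs):
        return threadsafe_iter(f(*args, **kwargs))
    return g


@threadsafe_generator
def counting_batches(n):
    # Hypothetical stand-in for a Keras data generator.
    for i in range(n):
        yield i


def consume(gen, out):
    # Each worker keeps pulling batches until the shared generator is exhausted.
    for batch in gen:
        out.append(batch)


gen = counting_batches(100)
results = []
workers = [threading.Thread(target=consume, args=(gen, results))
           for _ in range(4)]
for w in workers:
    w.start()
for w in workers:
    w.join()
# Every batch was handed to exactly one worker, none duplicated or dropped.
assert sorted(results) == list(range(100))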
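Similarly, a small usage sketch (again outside the patch) of the h5dict wrapper exercised by the new cases in tests/keras/utils/io_utils_test.py: the same mapping-style reads and writes, including lazily created nested groups, work whether the backing store is an HDF5 file or a plain dict, which is what lets the saving code target either one. The keys 'name', 'layer_0', and 'kernel' and the temporary file are illustrative; only the h5dict calls themselves follow the patch.

import os
import tempfile

import numpy as np
from numpy.testing import assert_allclose

from keras.utils.io_utils import h5dict

_, h5_path = tempfile.mkstemp('.h5')

# Exercise both backends with the same code path, as the new tests do.
for store in [h5_path, dict()]:
    kernel = np.random.random((3, 2))

    f = h5dict(store, mode='w')
    f['name'] = 'demo'             # plain strings round-trip
    group = f['layer_0']           # accessing a missing key creates a group
    group['kernel'] = kernel
    f.close()

    f = h5dict(store, mode='r')
    assert f['name'] == 'demo'
    assert 'layer_0' in f
    assert_allclose(f['layer_0']['kernel'], kernel)
    f.close()

os.remove(h5_path)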