diff --git a/BatchNormalization.lua b/BatchNormalization.lua
index 85f9bdb08..ba96cc271 100644
--- a/BatchNormalization.lua
+++ b/BatchNormalization.lua
@@ -29,18 +29,24 @@
 ]]--
 local BN,parent = torch.class('nn.BatchNormalization', 'nn.Module')
 
-function BN:__init(nOutput, eps, momentum)
+function BN:__init(nOutput, eps, momentum, affine)
    parent.__init(self)
    assert(nOutput and type(nOutput) == 'number',
-          'Missing argument #1: dimensionality of input. ' ..
-          'Give 0 for no affine transform')
+          'Missing argument #1: dimensionality of input. ')
+   assert(nOutput ~= 0, 'To set affine=false call BatchNormalization'
+      .. '(nOutput, eps, momentum, false) ')
+   if affine ~= nil then
+      assert(type(affine) == 'boolean', 'affine has to be true/false')
+      self.affine = affine
+   else
+      self.affine = true
+   end
    self.eps = eps or 1e-5
    self.train = true
    self.momentum = momentum or 0.1
-   self.running_mean = torch.Tensor()
-   self.running_std = torch.Tensor()
+   self.running_mean = torch.zeros(nOutput)
+   self.running_std = torch.ones(nOutput)
 
-   if nOutput > 0 then self.affine = true end
    if self.affine then
       self.weight = torch.Tensor(nOutput)
       self.bias = torch.Tensor(nOutput)
@@ -71,20 +77,12 @@ function BN:updateOutput(input)
    self.output:resizeAs(input)
    self.gradInput:resizeAs(input)
    if self.train == false then
-      assert(self.running_mean:nDimension() ~= 0,
-             'Module never run on training data. First run on some training data before evaluating.')
       self.output:copy(input)
       self.buffer:repeatTensor(self.running_mean, nBatch, 1)
       self.output:add(-1, self.buffer)
       self.buffer:repeatTensor(self.running_std, nBatch, 1)
       self.output:cmul(self.buffer)
    else -- training mode
-      if self.running_mean:nDimension() == 0 then
-         self.running_mean:resize(input:size(2)):zero()
-      end
-      if self.running_std:nDimension() == 0 then
-         self.running_std:resize(input:size(2)):zero()
-      end
       -- calculate mean over mini-batch
       self.buffer:mean(input, 1) -- E(x) = expectation of x.
       self.running_mean:mul(1 - self.momentum):add(self.momentum, self.buffer) -- add to running mean
diff --git a/SpatialBatchNormalization.lua b/SpatialBatchNormalization.lua
index 3f09c3f5b..cbc50d310 100644
--- a/SpatialBatchNormalization.lua
+++ b/SpatialBatchNormalization.lua
@@ -30,18 +30,24 @@
 ]]--
 
 local BN,parent = torch.class('nn.SpatialBatchNormalization', 'nn.Module')
 
-function BN:__init(nFeature, eps, momentum)
+function BN:__init(nFeature, eps, momentum, affine)
    parent.__init(self)
    assert(nFeature and type(nFeature) == 'number',
-          'Missing argument #1: Number of feature planes. ' ..
-          'Give 0 for no affine transform')
+          'Missing argument #1: Number of feature planes. ')
+   assert(nFeature ~= 0, 'To set affine=false call SpatialBatchNormalization'
+      .. '(nFeature, eps, momentum, false) ')
+   if affine ~= nil then
+      assert(type(affine) == 'boolean', 'affine has to be true/false')
+      self.affine = affine
+   else
+      self.affine = true
+   end
    self.eps = eps or 1e-5
    self.train = true
    self.momentum = momentum or 0.1
-   self.running_mean = torch.Tensor()
-   self.running_std = torch.Tensor()
-   if nFeature > 0 then self.affine = true end
+   self.running_mean = torch.zeros(nFeature)
+   self.running_std = torch.ones(nFeature)
    if self.affine then
       self.weight = torch.Tensor(nFeature)
       self.bias = torch.Tensor(nFeature)
@@ -75,20 +81,12 @@ function BN:updateOutput(input)
    self.output:resizeAs(input)
    self.gradInput:resizeAs(input)
    if self.train == false then
-      assert(self.running_mean:nDimension() ~= 0,
-             'Module never run on training data. First run on some training data before evaluating.')
       self.output:copy(input)
       self.buffer:repeatTensor(self.running_mean:view(1, nFeature, 1, 1), nBatch, 1, iH, iW)
       self.output:add(-1, self.buffer)
       self.buffer:repeatTensor(self.running_std:view(1, nFeature, 1, 1), nBatch, 1, iH, iW)
       self.output:cmul(self.buffer)
    else -- training mode
-      if self.running_mean:nDimension() == 0 then
-         self.running_mean:resize(nFeature):zero()
-      end
-      if self.running_std:nDimension() == 0 then
-         self.running_std:resize(nFeature):zero()
-      end
       -- calculate mean over mini-batch, over feature-maps
       local in_folded = input:view(nBatch, nFeature, iH * iW)
       self.buffer:mean(in_folded, 1)
diff --git a/doc/convolution.md b/doc/convolution.md
index d0a02fa93..bb8a5f9e7 100755
--- a/doc/convolution.md
+++ b/doc/convolution.md
@@ -513,10 +513,10 @@ w2=image.display(processed)
 
 ## SpatialBatchNormalization ##
 
-`module` = `nn.SpatialBatchNormalization(N [,eps] [, momentum])`
+`module` = `nn.SpatialBatchNormalization(N [,eps] [, momentum] [,affine])`
 where N = number of input feature maps
-giving N = 0 disables the learnable affine transform. eps is a small value added to the standard-deviation to avoid divide-by-zero. Defaults to 1e-5
+`affine` is a boolean. When set to false, the learnable affine transform is disabled. Defaults to true.
 
 Implements Batch Normalization as described in the paper:
    "Batch Normalization: Accelerating Deep Network Training
@@ -548,7 +548,7 @@ A = torch.randn(b, m, h, w)
 C = model.forward(A) -- C will be of size `b x m x h x w`
 
 -- without learnable parameters
-model = nn.SpatialBatchNormalization(0)
+model = nn.SpatialBatchNormalization(m, nil, nil, false)
 A = torch.randn(b, m, h, w)
 C = model.forward(A) -- C will be of size `b x m x h x w`
 ```
diff --git a/doc/simple.md b/doc/simple.md
index 4eac36d38..6ef7ed28a 100755
--- a/doc/simple.md
+++ b/doc/simple.md
@@ -909,10 +909,11 @@ C = model.forward({A, B}) -- C will be of size `b x m x n`
 
 ## BatchNormalization ##
 
 ```lua
-module = nn.BatchNormalization(N [, eps] [, momentum])
+module = nn.BatchNormalization(N [, eps] [, momentum] [,affine])
 ```
-where `N` is the dimensionality of input, giving `N = 0` disables the learnable affine transform.
+where `N` is the dimensionality of input. `eps` is a small value added to the standard-deviation to avoid divide-by-zero. Defaults to `1e-5`.
+`affine` is a boolean. When set to false, the learnable affine transform is disabled. Defaults to true.
 
 During training, this layer keeps a running estimate of its computed mean and std.
 The running sum is kept with a default momentum of 0.1 (unless over-ridden)
@@ -939,7 +940,7 @@ A = torch.randn(b, m)
 C = model.forward(A) -- C will be of size `b x m`
 
 -- without learnable parameters
-model = nn.BatchNormalization(0)
+model = nn.BatchNormalization(m, nil, nil, false)
 A = torch.randn(b, m)
 C = model.forward(A) -- C will be of size `b x m`
 ```
diff --git a/test.lua b/test.lua
index e60f4252a..88f6f6568 100644
--- a/test.lua
+++ b/test.lua
@@ -438,6 +438,13 @@ function nntest.Sqrt()
    local err = out:dist(in1:sqrt())
    mytester:assertlt(err, 1e-15, torch.typename(module) .. ' - forward err ')
 
+   -- Test zero inputs; we will avoid a div-by-zero by setting to zero
+   local zin = torch.DoubleTensor(5, 7):zero()
+   module:forward(zin)
+   local zgradout = torch.rand(5, 7)
+   local zgradin = module:backward(zin, zgradout)
+   mytester:assertTensorEq(zgradin, torch.DoubleTensor(5, 7):zero(), 0.000001, "error in sqrt backward singularity")
+
    local ini = math.random(3,5)
    local inj = math.random(3,5)
    local ink = math.random(3,5)
@@ -3662,7 +3669,7 @@ function nntest.BatchNormalization()
    mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
 
    -- batch norm without affine transform
-   module = nn.BatchNormalization(0)
+   module = nn.BatchNormalization(indim, 1e-5, 0.1, false)
    local err = jac.testJacobian(module,input)
    mytester:assertlt(err,precision, 'error on state ')
@@ -3716,7 +3723,7 @@ function nntest.SpatialBatchNormalization()
    mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
 
    -- batch norm without affine transform
-   module = nn.SpatialBatchNormalization(0)
+   module = nn.SpatialBatchNormalization(indim, 1e-5, 0.1, false)
    local err = jac.testJacobian(module,input)
    mytester:assertlt(err,precision, 'error on state ')
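
For reviewers trying the change locally, here is a minimal usage sketch of the new `affine` argument, assuming the patch above is applied; it only restates the updated examples from `doc/simple.md` and `doc/convolution.md`, and the variable names (`bnNoAffine`, `sbn`, etc.) are illustrative:

```lua
require 'nn'

local b, m, h, w = 16, 4, 8, 8  -- batch size, feature maps, height, width

-- learnable affine transform enabled (default: affine = true)
local bn = nn.BatchNormalization(m)
local y = bn:forward(torch.randn(b, m))            -- y is b x m

-- without learnable parameters: pass affine = false as the 4th argument
-- (the old nn.BatchNormalization(0) form now fails the new nOutput ~= 0 assert)
local bnNoAffine = nn.BatchNormalization(m, nil, nil, false)
local y2 = bnNoAffine:forward(torch.randn(b, m))   -- y2 is b x m

-- spatial variant, also without the affine transform
local sbn = nn.SpatialBatchNormalization(m, nil, nil, false)
local z = sbn:forward(torch.randn(b, m, h, w))     -- z is b x m x h x w
```

Passing `nil` for `eps` and `momentum` keeps their defaults (`1e-5` and `0.1`), so only the fourth argument needs to be spelled out to disable the affine transform.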