diff --git a/NEWS.md b/NEWS.md index 9bf97ddb3b..5140435114 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,11 +1,22 @@ # Flux Release Notes +## v0.14 + +* The use of Zygote's implicit parameters (with `Flux.params` and global variables) is deprecated in favour of the explicit style. + The function `train!` has new methods (accepting the model itself) to handle this. + +* Sub-module `Flux.Optimise` has been removed, in favour of using [Optimisers.jl](https://github.com/FluxML/Optimisers.jl) more deeply. + The function `train!` now lives in sub-module `Flux.Train`, and has re-written internals. + +* One-hot arrays have moved to a new package [OneHotArrays.jl](https://github.com/FluxML/OneHotArrays.jl). + ## v0.13.4 * Added [`PairwiseFusion` layer](https://github.com/FluxML/Flux.jl/pull/1983) -## v0.13 +## v0.13 (April 2022) + * After a deprecations cycle, the datasets in `Flux.Data` have -been removed in favour of MLDatasets.jl. + been removed in favour of [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl). * `params` is not exported anymore since it is a common name and is also exported by Distributions.jl * `flatten` is not exported anymore due to clash with Iterators.flatten. * Remove Juno.jl progress bar support as it is now obsolete. @@ -48,7 +59,7 @@ been removed in favour of MLDatasets.jl. * CUDA.jl 3.0 support * Bug fixes and optimizations. -## v0.12.0 +## v0.12.0 (March 2021) * Add [identity_init](https://github.com/FluxML/Flux.jl/pull/1524). * Add [Orthogonal Matrix initialization](https://github.com/FluxML/Flux.jl/pull/1496) as described in [Exact solutions to the nonlinear dynamics of learning in deep linear neural networks](https://arxiv.org/abs/1312.6120). @@ -73,7 +84,7 @@ been removed in favour of MLDatasets.jl. * Adds the [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser. * Other new features and bug fixes (see GitHub releases page) -## v0.11 +## v0.11 (July 2020) * Moved CUDA compatibility to use [CUDA.jl instead of CuArrays.jl](https://github.com/FluxML/Flux.jl/pull/1204) * Add [kaiming initialization](https://arxiv.org/abs/1502.01852) methods: [kaiming_uniform and kaiming_normal](https://github.com/FluxML/Flux.jl/pull/1243) @@ -101,7 +112,7 @@ keyword argument. The `Dropout` struct *whose behavior is left unchanged) is the See GitHub's releases. -## v0.10.0 +## v0.10.0 (November 2019) * The default AD engine has switched from [Tracker to Zygote.jl](https://github.com/FluxML/Flux.jl/pull/669) - The dependency on Tracker.jl has been removed. diff --git a/src/Flux.jl b/src/Flux.jl index 0cacbd419a..c8ae8153fb 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -11,7 +11,7 @@ import Optimisers: Optimisers, trainable, destructure # before v0.13, Flux owne using Zygote, ChainRulesCore using Zygote: Params, @adjoint, gradient, pullback, @nograd -export gradient +# export gradient # stop exporting this, to make people say "using Zygote", and make it easier to replace # Pirate error to catch a common mistake. (Internal function `base` because overloading `update!` is more likely to give ambiguities.) Optimisers.base(dx::Zygote.Grads) = error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`") @@ -25,14 +25,15 @@ export Chain, Dense, Maxout, SkipConnection, Parallel, PairwiseFusion, fmap, cpu, gpu, f32, f64, testmode!, trainmode!
-include("optimise/Optimise.jl") -using .Optimise -using .Optimise: @epochs -using .Optimise: skip -export Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, - AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, - WeightDecay, ClipValue, ClipNorm +include("train/Train.jl") +using .Train +export train! +# Stop exporting these, since Optimisers.jl exports the same names, +# and with this PR, Flux.Adam() is literally a wrapper around Adam(). +# export Descent, Adam, Momentum, Nesterov, RMSProp, +# AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, +# AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, +# WeightDecay, ClipValue, ClipNorm using CUDA const use_cuda = Ref{Union{Nothing,Bool}}(nothing) diff --git a/src/deprecations.jl b/src/deprecations.jl index 6719bd39e2..1769a94170 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -34,10 +34,10 @@ struct Zeros end Zeros(args...) = Zeros() # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros()) -function Optimise.update!(x::AbstractArray, x̄) - Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", :update!) - x .-= x̄ -end +# function Optimise.update!(x::AbstractArray, x̄) +# Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", :update!) +# x .-= x̄ +# end function Diagonal(size::Integer...; kw...) Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal) @@ -80,3 +80,6 @@ Base.@deprecate_binding RADAM RAdam Base.@deprecate_binding OADAM OAdam Base.@deprecate_binding ADAGrad AdaGrad Base.@deprecate_binding ADADelta AdaDelta + +# What remains from the Optimise sub-module has moved to Train: +Base.@deprecate_binding Optimise Train diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl deleted file mode 100644 index e691ce0170..0000000000 --- a/src/optimise/Optimise.jl +++ /dev/null @@ -1,15 +0,0 @@ -module Optimise - -using LinearAlgebra -import ArrayInterface - -export train!, update!, - Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW,RAdam, OAdam, AdaBelief, - InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, - ClipValue, ClipNorm - -include("optimisers.jl") -include("train.jl") - -end diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl deleted file mode 100644 index ce72a4b0ce..0000000000 --- a/src/optimise/optimisers.jl +++ /dev/null @@ -1,724 +0,0 @@ -using Flux -using MacroTools: @forward - -abstract type AbstractOptimiser end - -const EPS = 1e-8 - -# TODO: should use weak refs - -""" - Descent(η = 0.1) - -Classic gradient descent optimiser with learning rate `η`. -For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. - -# Examples -```julia -opt = Descent() - -opt = Descent(0.3) - -ps = Flux.params(model) - -gs = gradient(ps) do - loss(x, y) -end - -Flux.Optimise.update!(opt, ps, gs) -``` -""" -mutable struct Descent <: AbstractOptimiser - eta::Float64 -end - -Descent() = Descent(0.1) - -function apply!(o::Descent, x, Δ) - Δ .*= o.eta -end - -""" - Momentum(η = 0.01, ρ = 0.9) - -Gradient descent optimizer with learning rate `η` and momentum `ρ`. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. 
-- Momentum (`ρ`): Controls the acceleration of gradient descent in the - prominent direction, in effect damping oscillations. - -# Examples -```julia -opt = Momentum() - -opt = Momentum(0.01, 0.99) -``` -""" -mutable struct Momentum <: AbstractOptimiser - eta::Float64 - rho::Float64 - velocity::IdDict -end - -Momentum(η = 0.01, ρ = 0.9) = Momentum(η, ρ, IdDict()) - -function apply!(o::Momentum, x, Δ) - η, ρ = o.eta, o.rho - v = get!(() -> zero(x), o.velocity, x)::typeof(x) - @. v = ρ * v - η * Δ - @. Δ = -v -end - -""" - Nesterov(η = 0.001, ρ = 0.9) - -Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the - prominent direction, in effect damping oscillations. - -# Examples -```julia -opt = Nesterov() - -opt = Nesterov(0.003, 0.95) -``` -""" -mutable struct Nesterov <: AbstractOptimiser - eta::Float64 - rho::Float64 - velocity::IdDict -end - -Nesterov(η = 0.001, ρ = 0.9) = Nesterov(η, ρ, IdDict()) - -function apply!(o::Nesterov, x, Δ) - η, ρ = o.eta, o.rho - v = get!(() -> zero(x), o.velocity, x)::typeof(x) - d = @. ρ^2 * v - (1+ρ) * η * Δ - @. v = ρ*v - η*Δ - @. Δ = -d -end - -""" - RMSProp(η = 0.001, ρ = 0.9, ϵ = $EPS) - -Optimizer using the -[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) -algorithm. Often a good choice for recurrent networks. Parameters other than learning rate -generally don't need tuning. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Momentum (`ρ`): Controls the acceleration of gradient descent in the - prominent direction, in effect damping oscillations. - -# Examples -```julia -opt = RMSProp() - -opt = RMSProp(0.002, 0.95) -``` -""" -mutable struct RMSProp <: AbstractOptimiser - eta::Float64 - rho::Float64 - epsilon::Float64 - acc::IdDict -end -RMSProp(η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = EPS) = RMSProp(η, ρ, ϵ, IdDict()) -RMSProp(η::Real, ρ::Real, acc::IdDict) = RMSProp(η, ρ, EPS, acc) - -function apply!(o::RMSProp, x, Δ) - η, ρ = o.eta, o.rho - acc = get!(() -> zero(x), o.acc, x)::typeof(x) - @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) - @. Δ *= η / (√acc + o.epsilon) -end - -""" - Adam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) - -[Adam](https://arxiv.org/abs/1412.6980) optimiser. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. - -# Examples -```julia -opt = Adam() - -opt = Adam(0.001, (0.9, 0.8)) -``` -""" -mutable struct Adam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} -end -Adam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = Adam(η, β, ϵ, IdDict()) -Adam(η::Real, β::Tuple, state::IdDict) = Adam(η, β, EPS, state) - -function apply!(o::Adam, x, Δ) - η, β = o.eta, o.beta - - mt, vt, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x),typeof(x),Vector{Float64}} - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. 
Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η - βp .= βp .* β - - return Δ -end - -""" - RAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) - -[Rectified Adam](https://arxiv.org/abs/1908.03265) optimizer. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. - -# Examples -```julia -opt = RAdam() - -opt = RAdam(0.001, (0.9, 0.8)) -``` -""" -mutable struct RAdam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} -end -RAdam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RAdam(η, β, ϵ, IdDict()) -RAdam(η::Real, β::Tuple, state::IdDict) = RAdam(η, β, EPS, state) - -function apply!(o::RAdam, x, Δ) - η, β = o.eta, o.beta - ρ∞ = 2/(1-β[2])-1 - - mt, vt, βp, t = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]], Ref(1)) - end :: Tuple{typeof(x),typeof(x),Vector{Float64},Base.RefValue{Int}} - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - ρ = ρ∞ - 2t[] * βp[2] / (1 - βp[2]) - if ρ > 4 - r = sqrt((ρ-4)*(ρ-2)*ρ∞/((ρ∞-4)*(ρ∞-2)*ρ)) - @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η * r - else - @. Δ = mt / (1 - βp[1]) * η - end - βp .= βp .* β - t[] += 1 - - return Δ -end - -""" - AdaMax(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) - -[AdaMax](https://arxiv.org/abs/1412.6980) is a variant of Adam based on the ∞-norm. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. - -# Examples -```julia -opt = AdaMax() - -opt = AdaMax(0.001, (0.9, 0.995)) -``` -""" -mutable struct AdaMax <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} -end -AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict()) -AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state) - -function apply!(o::AdaMax, x, Δ) - η, β = o.eta, o.beta - - mt, ut, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x),typeof(x),Vector{Float64}} - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. ut = max(β[2] * ut, abs(Δ)) - @. Δ = (η/(1 - βp[1])) * mt/(ut + o.epsilon) - βp .= βp .* β - - return Δ -end - -""" - OAdam(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ = $EPS) - -[OAdam](https://arxiv.org/abs/1711.00141) (Optimistic Adam) -is a variant of Adam adding an "optimistic" term suitable for adversarial training. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. - -# Examples -```julia -opt = OAdam() - -opt = OAdam(0.001, (0.9, 0.995)) -``` -""" -mutable struct OAdam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} -end -OAdam(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OAdam(η, β, ϵ, IdDict()) -OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) - -function apply!(o::OAdam, x, Δ) - η, β = o.eta, o.beta - - mt, vt, Δ_, βp = get!(o.state, x) do - (zero(x), zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x),typeof(x),typeof(x),Vector{Float64}} - - @. 
mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = -Δ_ - @. Δ_ = η * mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) - @. Δ += 2Δ_ - βp .= βp .* β - - return Δ -end - -""" - AdaGrad(η = 0.1, ϵ = $EPS) - -[AdaGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has -parameter specific learning rates based on how frequently it is updated. -Parameters don't need tuning. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. - -# Examples -```julia -opt = AdaGrad() - -opt = AdaGrad(0.001) -``` -""" -mutable struct AdaGrad <: AbstractOptimiser - eta::Float64 - epsilon::Float64 - acc::IdDict -end -AdaGrad(η::Real = 0.1, ϵ::Real = EPS) = AdaGrad(η, ϵ, IdDict()) -AdaGrad(η::Real, state::IdDict) = AdaGrad(η, EPS, state) - -function apply!(o::AdaGrad, x, Δ) - η = o.eta - acc = get!(() -> fill!(similar(x), o.epsilon), o.acc, x)::typeof(x) - @. acc += Δ * conj(Δ) - @. Δ *= η / (√acc + o.epsilon) -end - -""" - AdaDelta(ρ = 0.9, ϵ = $EPS) - -[AdaDelta](https://arxiv.org/abs/1212.5701) is a version of AdaGrad adapting its learning -rate based on a window of past gradient updates. -Parameters don't need tuning. - -# Parameters -- Rho (`ρ`): Factor by which the gradient is decayed at each time step. - -# Examples -```julia -opt = AdaDelta() - -opt = AdaDelta(0.89) -``` -""" -mutable struct AdaDelta <: AbstractOptimiser - rho::Float64 - epsilon::Float64 - state::IdDict{Any, Any} -end -AdaDelta(ρ::Real = 0.9, ϵ::Real = EPS) = AdaDelta(ρ, ϵ, IdDict()) -AdaDelta(ρ::Real, state::IdDict) = AdaDelta(ρ, EPS, state) - -function apply!(o::AdaDelta, x, Δ) - ρ = o.rho - acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)} - @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) - # DON'T remove epsilon from numerator - # or even out of the square roots - @. Δ *= √(Δacc + o.epsilon) / √(acc + o.epsilon) - @. Δacc = ρ * Δacc + (1 - ρ) * Δ * conj(Δ) - return Δ -end - -""" - AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) - -The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the Adam -optimiser. Parameters don't need tuning. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. - -# Examples -```julia -opt = AMSGrad() - -opt = AMSGrad(0.001, (0.89, 0.995)) -``` -""" -mutable struct AMSGrad <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64, Float64} - epsilon::Float64 - state::IdDict{Any, Any} -end -AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict()) -AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state) - -function apply!(o::AMSGrad, x, Δ) - η, β = o.eta, o.beta - - mt, vt, v̂t = get!(o.state, x) do - (fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon)) - end :: NTuple{3,typeof(x)} - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2 - @. v̂t = max(v̂t, vt) - @. Δ = η * mt / (√v̂t + o.epsilon) -end - -""" - NAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) - -[NAdam](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of Adam. -Parameters don't need tuning. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. 
-- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. - -# Examples -```julia -opt = NAdam() - -opt = NAdam(0.002, (0.89, 0.995)) -``` -""" -mutable struct NAdam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64, Float64} - epsilon::Float64 - state::IdDict{Any, Any} -end -NAdam(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NAdam(η, β, ϵ, IdDict()) -NAdam(η::Real, β::Tuple, state::IdDict) = NAdam(η, β, EPS, state) - -function apply!(o::NAdam, x, Δ) - η, β = o.eta, o.beta - - mt, vt, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[o.beta[1], o.beta[2]]) - end :: Tuple{typeof(x),typeof(x),Vector{Float64}} - β1p, β2p = βp - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η - βp .= βp .* β - - return Δ -end - -""" - AdamW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0) - -[AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its -weight decay regularization. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. -- `decay`: Decay applied to weights during optimisation. - -# Examples -```julia -opt = AdamW() - -opt = AdamW(0.001, (0.89, 0.995), 0.1) -``` -""" -AdamW(η = 0.001, β = (0.9, 0.999), decay = 0) = - Optimiser(Adam(η, β), WeightDecay(decay)) - -""" - AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) - -The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known -Adam optimiser. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. - -# Examples -```julia -opt = AdaBelief() - -opt = AdaBelief(0.001, (0.9, 0.8)) -``` -""" -mutable struct AdaBelief <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} -end -AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict()) -AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state) - -function apply!(o::AdaBelief, x, Δ) - η, β = o.eta, o.beta - - mt, st, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x), typeof(x), Vector{Float64}} - - #= st is a variance and can go to zero. This is in contrast to Adam, which uses the - second moment which is usually far enough from zero. This is problematic, since st - can be slightly negative due to numerical error, and the square root below will fail. - Also, if we want to differentiate through the optimizer, √0 is not differentiable. - To protect against this, we add a small number, st -> st + eps2. - The original implementation (https://github.com/juntang-zhuang/Adabelief-Optimizer) - uses the square of Adam's epsilon, which we do here. - See also: https://github.com/juntang-zhuang/Adabelief-Optimizer/issues/61 =# - eps2 = o.epsilon^2 # TODO: make epsilon^2 the default in next breaking release - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. st = β[2] * st + (1 - β[2]) * (Δ - mt) * conj(Δ - mt) + eps2 - @. 
Δ = η * mt / (1 - βp[1]) / (√(st / (1 - βp[2])) + eps2) - βp .= βp .* β - - return Δ -end - - -# Compose optimizers - -""" - Optimiser(a, b, c...) - -Combine several optimisers into one; each optimiser produces a modified gradient -that will be fed into the next, and this is finally applied to the parameter as -usual. -""" -mutable struct Optimiser <: AbstractOptimiser - os::Vector{Any} -end - -Optimiser(opts::AbstractOptimiser...) = Optimiser(Any[opts...]) - -@forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, Base.setindex! -@forward Optimiser.os Base.iterate - -Base.getindex(c::Optimiser, i::AbstractArray) = Optimiser(c.os[i]...) - -function apply!(o::Optimiser, x, Δ) - for opt in o.os - Δ = apply!(opt, x, Δ) - end - return Δ -end - -""" - InvDecay(γ = 0.001) - -Apply inverse time decay to an optimiser, so that the effective step size at -iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size. -The wrapped optimiser's step size is not modified. - -See also the [Scheduling Optimisers](@ref) section of the docs -for more general scheduling techniques. - -# Examples - -`InvDecay` is typically composed with other optimizers -as the last transformation of the gradient: - -```julia -# Inverse decay of the learning rate -# with starting value 0.001 and decay coefficient 0.01. -opt = Optimiser(Adam(1f-3), InvDecay(1f-2)) -``` -""" -mutable struct InvDecay <: AbstractOptimiser - gamma::Float64 - state::IdDict{Any, Int} -end - -InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any, Int}()) - -function apply!(o::InvDecay, x, Δ) - γ = o.gamma - n = get!(o.state, x, 1) - Δ .*= 1 / (1 + γ * n) - o.state[x] = n + 1 - return Δ -end - -""" - ExpDecay(η = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4, start = 1) - -Discount the learning rate `η` by the factor `decay` every `decay_step` steps till -a minimum of `clip`. - -# Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- `decay`: Factor by which the learning rate is discounted. -- `decay_step`: Schedule decay operations by setting the number of steps between - two decay operations. -- `clip`: Minimum value of learning rate. -- 'start': Step at which the decay starts. - - -See also the [Scheduling Optimisers](@ref) section of the docs -for more general scheduling techniques. - -# Examples - -`ExpDecay` is typically composed with other optimizers -as the last transformation of the gradient: -```julia -opt = Optimiser(Adam(), ExpDecay(1.0)) -``` -Note: you may want to start with `η=1` in `ExpDecay` when combined with other -optimizers (`Adam` in this case) that have their own learning rate. -""" -mutable struct ExpDecay <: AbstractOptimiser - eta::Float64 - decay::Float64 - step::Int64 - clip::Float64 - start::Int64 - current::IdDict -end - -ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4, start = 0) = - ExpDecay(opt, decay, decay_step, clip, start, IdDict()) - -function apply!(o::ExpDecay, x, Δ) - η, s, decay, start = o.eta, o.step, o.decay, o.start - n = o.current[x] = get(o.current, x, 0) + 1 - if n > start && n % s == 0 && count(x -> x > start && x % s == 0, values(o.current)) == 1 - η = max(η * decay, o.clip) - o.eta = η - end - @. Δ *= η -end - -""" - WeightDecay(λ = 0) - -Decay weights by ``λ``. -Typically composed with other optimizers as the first transformation to the gradient, -making it equivalent to adding ``L_2`` regularization -with coefficient ``λ`` to the loss. 
- -# Examples - -```julia -opt = Optimiser(WeightDecay(1f-4), Adam()) -``` -""" -mutable struct WeightDecay <: AbstractOptimiser - wd::Real -end - -WeightDecay() = WeightDecay(0) - -function apply!(o::WeightDecay, x, Δ) - wd = o.wd - @. Δ += wd * x -end - -""" - ClipValue(thresh) - -Clip gradients when their absolute value exceeds `thresh`. -""" -mutable struct ClipValue{T} <: AbstractOptimiser - thresh::T -end - -apply!(o::ClipValue, x, Δ) = clamp!(Δ, -o.thresh, o.thresh) - -""" - ClipNorm(thresh) - -Clip gradients when their L2 norm exceeds `thresh`. -""" -mutable struct ClipNorm{T} <: AbstractOptimiser - thresh::T -end - -function apply!(o::ClipNorm, x, Δ) - Δnrm = norm(Δ) - if Δnrm > o.thresh - rmul!(Δ, o.thresh / Δnrm) - end - return Δ -end diff --git a/src/optimise/train.jl b/src/optimise/train.jl deleted file mode 100644 index c907938504..0000000000 --- a/src/optimise/train.jl +++ /dev/null @@ -1,157 +0,0 @@ -using ProgressLogging: @progress, @withprogress, @logprogress -import Zygote: Params, gradient - - -""" - update!(opt, p, g) - update!(opt, ps::Params, gs) - -Perform an update step of the parameters `ps` (or the single parameter `p`) -according to optimizer `opt` and the gradients `gs` (the gradient `g`). - -As a result, the parameters are mutated and the optimizer's internal state may change. -The gradient could be mutated as well. -""" -function update!(opt::AbstractOptimiser, x, x̄) - x̄r = ArrayInterface.restructure(x, x̄) # address some cases where Zygote's - # output are not mutable, see #1510 - x .-= apply!(opt, x, x̄r) -end - -function update!(opt::AbstractOptimiser, xs::Params, gs) - for x in xs - isnothing(gs[x]) && continue - update!(opt, x, gs[x]) - end -end - -# Callback niceties -call(f, xs...) = f(xs...) -runall(f) = f -runall(fs::AbstractVector) = () -> foreach(call, fs) - -struct SkipException <: Exception end - -""" - skip() - -Call `Flux.skip()` in a callback to indicate when a callback condition is met. -This will trigger the train loop to skip the current data point and not update with the calculated gradient. - -# Examples -```julia -cb = function () - loss() > 1e7 && Flux.skip() -end -``` -""" -function skip() - throw(SkipException()) -end - - -struct StopException <: Exception end - -""" - stop() - -Call `Flux.stop()` in a callback to indicate when a callback condition is met. -This will trigger the train loop to stop and exit. - -# Examples -```julia -cb = function () - accuracy() > 0.9 && Flux.stop() -end -``` -""" -function stop() - throw(StopException()) -end - -batchmemaybe(x) = tuple(x) -batchmemaybe(x::Tuple) = x - -""" - train!(loss, pars::Params, data, opt::AbstractOptimiser; [cb]) - -Uses a `loss` function and training `data` to improve the -model's parameters according to a particular optimisation rule `opt`. - -For each `d in data`, first the gradient of the `loss` is computed like this: -``` - gradient(() -> loss(d...), pars) # if d isa Tuple - gradient(() -> loss(d), pars) # otherwise -``` -Here `pars` is produced by calling [`Flux.params`](@ref) on your model. -(Or just on the layers you want to train, like `train!(loss, params(model[1:end-2]), data, opt)`.) -This is the "implicit" style of parameter handling. - -Then, this gradient is used by optimizer `opt` to update the paramters: -``` - update!(opt, pars, grads) -``` -The optimiser should be from the [Flux.Optimise](@ref) module. -Different optimisers can be combined using [Flux.Optimise.Optimiser](@ref). - -This training loop iterates through `data` once. 
-You can use [`@epochs`](@ref) to do this several times, or -use for instance `Iterators.repeat` to make a longer `data` iterator. - -## Callbacks - -[Callbacks](@ref) are given with the keyword argument `cb`. -For example, this will print "training" every 10 seconds (using [`Flux.throttle`](@ref)): -``` - train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) -``` - -The callback can call [`Flux.stop`](@ref) to interrupt the training loop. - -Multiple callbacks can be passed to `cb` as array. -""" -function train!(loss, ps::Params, data, opt::AbstractOptimiser; cb = () -> ()) - cb = runall(cb) - itrsz = Base.IteratorSize(typeof(data)) - n = (itrsz == Base.HasLength()) || (itrsz == Base.HasShape{1}()) ? length(data) : 0 - @withprogress for (i, d) in enumerate(data) - try - gs = gradient(ps) do - loss(batchmemaybe(d)...) - end - update!(opt, ps, gs) - cb() - catch ex - if ex isa StopException - break - elseif ex isa SkipException - continue - else - rethrow(ex) - end - end - @logprogress iszero(n) ? nothing : i / n - end -end - -""" - @epochs N body - -Run `body` `N` times. Mainly useful for quickly doing multiple epochs of -training in a REPL. - -# Examples -```jldoctest -julia> Flux.@epochs 2 println("hello") -[ Info: Epoch 1 -hello -[ Info: Epoch 2 -hello -``` -""" -macro epochs(n, ex) - :(@progress for i = 1:$(esc(n)) - @info "Epoch $i" - $(esc(ex)) - end) -end diff --git a/src/train/Train.jl b/src/train/Train.jl new file mode 100644 index 0000000000..32049b9285 --- /dev/null +++ b/src/train/Train.jl @@ -0,0 +1,200 @@ +module Train + +using LinearAlgebra +using Optimisers: Optimisers +using Functors: fmap + +export train!, update!, adjust!, FluxState, @epochs, + Descent, Adam, Momentum, Nesterov, RMSProp, + AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW, RAdam, OAdam, AdaBelief #, + # InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, + # ClipValue, ClipNorm + + +### Mutable state storage, to wrap Optimisers.jl + +""" + FluxState(rule, state=missing) + +This is an interface between the all-mutable world Flux.jl likes, +and the could-be-immutable world that Optimisers.jl inhabits. + +`state` can can be either the whole state tree which Optimisers.jl builds, +or else (for Zygote's implicit mode) an IdDict of such states. +Once initialised, it cannot change between these two modes. +""" +mutable struct FluxState{T<:Optimisers.AbstractRule}; + rule::T + state::Any +end + +function Base.show(io::IO, opt::FluxState) + print(io, "FluxState(") + show(io, opt.rule) + if opt.state isa Missing + print(io, ", )") + elseif opt.state isa IdDict + n = length(keys(opt.state)) + print(io, ", ))") + else + rn = Ref(0) + fmap(x -> (rn[]+=1; x), opt.state, exclude = (x -> x isa Optimisers.Leaf)) + print(io, ", )") + end +end + +for opt in [ + :Descent, :Adam, :Momentum, :Nesterov, :RMSProp, + :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :AdamW, :RAdam, :OAdam, :AdaBelief, + # :InvDecay, :ExpDecay, :WeightDecay, :stop, :skip, :Optimiser, + # :ClipValue, :ClipNorm, +# TODO check that parameters line up nicely old-vs-new, and include the remaining rules +] + @eval $opt(parameters...; kw...) = FluxState(Optimisers.$opt(parameters...; kw...), missing) +end + + +### Two styles of gradient, and their `train!` functions + +using ProgressLogging: @progress, @withprogress, @logprogress +using Zygote: Zygote, Params + +include("explicit_train.jl.jl") # new! +include("implicit_train.jl.jl") # Params etc, Zygote only + +explicit_withgradient(f, args...) = Zygote.withgradient(f, args...) 
# can overload this to use e.g. Yota / Diffractor + +# using Requires # Flux doesn't use this right now +# @init @require Diffractor="9f5e2b26-1114-432f-b630-d3fe2085c51c" begin +# @eval function explicit_withgradient(f, args...) +# y, back = Diffractor.∂⃖¹(f, args...) +# _, grads... = back(Zygote.sensitivity(y)) +# return (; value = y, gradient = grads) +# end +# end + +#= + +using Diffractor +function Flux.Train.explicit_withgradient(f, args...) + y, back = Diffractor.∂⃖¹(f, args...) + _, grads... = back(one(y)) + return (; value = y, gradient = grads) +end + +=# + +### Misc. related utilities + +""" + Flux.adjust!(opt::FluxState, η::Real) + +Alters the learning rate of the optimiser, +without resetting its stored momentum state, etc. +""" +function adjust!(opt::FluxState, eta::Real) + opt.rule = Optimisers.adjust(opt.rule, eta) + s = opt.state + if s isa missing + elseif s isa IdDict + for k in keys(s) + s[k] = Optimisers.adjust(s[k], eta) + end + else + s = Optimisers.adjust(s, eta) + end + opt.state = s + return opt +end + +""" + @epochs N body + +Run `body` expression `N` times. Mainly useful for quickly doing +multiple epochs of training in a REPL. + +Functionally equivalent to this loop: +``` +for _ in 1:N + body +end +``` +... but adds progress logging and `@info` messages, +and returns the result of the last iteration. + +# Examples +```jldoctest +julia> Flux.@epochs 2 println("hello") +[ Info: Epoch 1 +hello +[ Info: Epoch 2 +hello +``` +""" +macro epochs(n, ex) + @gensym val + body = :(for i in 1:$(esc(n)) + @info "Epoch $i" + $(esc(val)) = $(esc(ex)) + end) + loop = Expr(:macrocall, Symbol("@progress"), __source__, body) + Expr(:block, :($(esc(val)) = nothing), loop, :($(esc(val)))) + # TODO make this actualy return the value? Names aren't right. +# +# $loop +# # @progress for i in 1:$(esc(n)) +# # @info "Epoch $i" +# # $(esc(val)) = $(esc(ex)) +# # end +# $val # DOESN"T WORK! Expr(:macrocall, ...) ? +# end +end + +end + + +#= + +using Flux, Random +data = [(rand(3,2).*[i,1,20/i], [i i]) for i in 1:50] |> shuffle!; + +# This exact code works on Flux@0.13. There, train! returns nothing: +model2 = Chain(Dense(3 => 7, relu), Dense(7 => 1)) +opt2 = Flux.Adam() +Flux.train!(Flux.params(model2), data, opt2) do x, y + Flux.mse(model2(x), y) +end +opt2 # contains an IdDict + +# This is the new "explicit" method of Train +model1 = Chain(Dense(3 => 7, relu), Dense(7 => 1)) +opt1 = Flux.Adam() +Flux.train!(model1, data, opt1) do m, x, y + Flux.mse(m(x), y) +end |> sum +opt1 # contains state tree + +# This is new 3-arg train!, one step not an iteration over data: +x1, y1 = data[1] +Flux.train!(model1, opt1) do m + Flux.mse(m(x1), y1) +end + + + + + +julia> using ProgressLogging +julia> @macroexpand1 @loop N body +begin + x = nothing + @progress for i in 1:N + @info "step $i" + x = body + end + x +end + + + +=# \ No newline at end of file diff --git a/src/train/explicit_train.jl b/src/train/explicit_train.jl new file mode 100644 index 0000000000..edd31b281e --- /dev/null +++ b/src/train/explicit_train.jl @@ -0,0 +1,118 @@ +""" + train!(loss, model, data, opt::FluxState) + +Flux 0.14 no longer uses Zygote's implicit parameter dictionary `Flux.params`. + +The major change to `train!` is that instead of `loss` being a function which typically accepts +two arguments (the input `x` and expected output `y` from each element of `data`) +now it should typically accept three, the first of which is the `model` itself. + +For example, with these definitions... 
+``` +data = [(x1, y1), (x2, y2), (x3, y3)]; # each element must be a tuple (or NamedTuple) + +loss(m, x, y) = Flux.crossentropy(m(x), y) # the model is the first argument + +opt = Flux.Adam() # now returns a FluxState +``` +...calling `train!(loss, model, data, opt)` runs a loop like this: +``` +for d in data + ∂L∂m = Zygote.gradient(loss, model, d...)[1] + # update the model using opt & ∂L∂m +end +``` +which evaluates the gradient of `loss(model, x1, y1)` with respect to `model`, +to know how to update the parameters stored within `model`. + +It is often convenient to provide the function `loss` using `do` block syntax, +instead of defining a named function: +``` +Flux.train!(model, Iterators.take(Iterators.cycle(data), 10), Flux.Adam()) do m, x, y + Flux.crossentropy(m(x), y) # this does not depend on global variables! +end +``` +Here `Iterators.take ∘ Iterators.cycle` uses the same `data` for 10 epochs. + +Callback functions are not supported. But see 3-argument `train!` for an +easy way to construct more complicated training loops. For example, this +adds printing & an early stop to the above: +``` +for (i, d) in enumerate(data) + x, y = d + ell = Flux.train!(model, opt) do m + Flux.crossentropy(m(x), y) + end + i%10==0 && println("on step \$i, the loss was \$ell") # prints every 10th step + ell<0.1 && break # stops training +end +``` +""" +function train!(loss::Function, model, data, opt::FluxState) + _initialise!(opt, model) + losses = Float32[] + s = opt.state + s isa IdDict && error("can't mix explicit & implicit!") + for d in data + l, (g, _...) = Zygote.withgradient(loss, model, train_ok(d)...) + s, model = Optimisers.update!(s, model, g) + push!(losses, l) + opt.state = s + end + return losses +end + +train_ok(x::T) where T = error("""train! expects every d in data to be a Tuple or a NamedTuple, got $T + To allow this type, define `Flux.Train.train_ok(x::$T) = (x,)`""") +train_ok(x::Tuple) = x +train_ok(x::NamedTuple) = x + +function _initialise!(opt::FluxState, model) + if opt.state isa Missing + opt.state = Optimisers.setup(opt.rule, model) + fmap(model, exclude = Optimisers.isnumeric) do x + Optimisers.maywrite(x) || error("model must be fully mutable for train! to work, got $(typeof(x))") + end + end + opt +end + +""" + train!(loss, model, opt) + +While the 4-argument method of `train!` iterates over a dataset, +calling `gradient` many times, this 3-argument version is for a single datapoint, +and calls `gradient` just once. + +It expects a function `loss` which takes just one argument, the model. +For instance: +``` +opt = Flux.Adam() +train!(model, opt) do m # the model is explicitly passed to the function as `m` + Flux.crossentropy(m(x1), y1) # but the data point `(x1, y1)` is closed over. +end +``` +This calls `Zygote.withgradient(m -> Flux.crossentropy(m(x1), y1), model)`. +(The `do` block is another syntax for this anonymous function.) +Then it updates the parameters contained within `model` according +to the chosen `opt`imiser. +Finally it returns the value of the loss function. +""" +function train!(loss::Function, model, opt::FluxState) + _initialise!(opt, model) + s = opt.state + s isa IdDict && error("can't mix explicit & implicit!") + l, (g, _...) = explicit_withgradient(loss, model) + opt.state, model = Optimisers.update!(s, model, g) + l +end + +function train!(loss::Function, model, data, opt::Optimisers.AbstractRule) + _initialise!(opt, model) + # fmap(opt.state) do x + # x isa Union{Number, AbstractArray{<:Number}} && @warn "optimiser state will be lost!"
+ # x + # end # won't work as you need to look inside Leaf for non-nothings. + @warn "optimiser state will be lost!" + train!(loss, model, data, FluxState(opt)) +end diff --git a/src/train/implicit_train.jl b/src/train/implicit_train.jl new file mode 100644 index 0000000000..43c3b75766 --- /dev/null +++ b/src/train/implicit_train.jl @@ -0,0 +1,81 @@ +""" + train!(loss, pars::Params, data, opt::FluxState) + +Legacy method, mimicking the behaviour of Flux <= 0.13. +(Note that the implementation is different, using Optimisers.jl internally.) + +For each `d in data`, first the gradient of the `loss` is computed like this: +``` + gradient(() -> loss(d...), pars) # if d isa Tuple + gradient(() -> loss(d), pars) # otherwise +``` +Here `pars` is produced by calling [`Flux.params`](@ref) on your model. +This is Zygote's "implicit" style of parameter handling. + +Then, this gradient is used by optimizer `opt` to update the paramters: +``` + update!(opt, pars, grads) +``` +The `data` is iterated through once in this manner. + +Typically `data` contains tuples, like `data = [(x1, y1), (x2, y2), (x3, y3)]`. +In this case the function might be `loss(x, y) = mse(model(x), y)`, accepting two arguments. +Notice that it closes over the `model`, which is a global variable. +""" +function train!(loss::Function, pars::Params, data, opt::FluxState) + Base.depwarn("""`Flux.train!` accepting implicit `Params` is a legacy method in Flux 0.14. + Explicit parameters are now preferred, see `train!(loss, model, data, opt)`""", :train!, force=true) + _initialise!(opt, pars) + losses = Float32[] + for d in data + l, grads = Zygote.withgradient(() -> loss(batchmemaybe(d)...), pars) + update!(opt, pars, grads) + push!(losses, l) + end + return losses +end + +batchmemaybe(x) = tuple(x) +batchmemaybe(x::Tuple) = x + +""" + train!(loss, pars::Params, opt::FluxState) + +This 3-arg method is a bit of a hybrid. With no `data` to iterate over, +it calls `gradient(() -> loss(), pars)` just once, then updates parameters. +""" +function train!(loss::Function, pars::Params, opt::FluxState) + Base.depwarn("""`Flux.train!` accepting implicit `Params` is a legacy method in Flux 0.14. + Explicit parameters are now preferred, see `train!(loss, model, data, opt)`""", :train!, force=true) + _initialise!(opt, pars) + l, grads = Zygote.withgradient(() -> loss(), pars) + update!(opt, pars, grads) + return l +end + +function _initialise!(opt::FluxState, pars::Params) + dict = IdDict() + for p in pars + dict[p] = Optimisers.setup(opt.rule, p) + end + opt.state = dict +end + +""" + Flux.update!(opt::FluxState, ps::Params, gs) + +Legacy method, mimicking the behaviour of Flux <= 0.13. +""" +function update!(opt::FluxState, xs::Params, gs) + Base.depwarn("Flux.update! is a legacy function", :update!) + for x in xs + isnothing(gs[x]) && continue + update!(opt, x, gs[x]) + end +end + +function update!(opt::FluxState, x::AbstractArray, dx) + opt.state[x], xnew = Optimisers.update!(opt.state[x], x, dx) + xnew === x || error("failed to mutate x!") + nothing +end \ No newline at end of file
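Taken together, the files above define the new explicit training path. Below is a minimal end-to-end sketch assuming this diff as written; the toy `data`, `model`, and hyper-parameters are hypothetical and not part of the patch, and the exact return values may differ in the merged release.

```julia
using Flux  # assumes a Flux build containing the Flux.Train module from this diff

# Hypothetical toy problem: 50 batches of (features, targets).
data  = [(rand(Float32, 3, 2), rand(Float32, 1, 2)) for _ in 1:50]
model = Chain(Dense(3 => 7, relu), Dense(7 => 1))

opt = Flux.Adam()  # a FluxState wrapping Optimisers.Adam(), state not yet initialised

# Explicit style: the loss takes the model as its first argument,
# and this train! method returns the per-batch losses.
losses = Flux.train!(model, data, opt) do m, x, y
    Flux.mse(m(x), y)
end

# Lower the learning rate without discarding the stored momentum state.
Flux.adjust!(opt, 1e-4)
```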