From d1cfed73ebccbe90ff8470efc23ba6c14e90b27a Mon Sep 17 00:00:00 2001 From: karthikdk Date: Mon, 13 Jun 2022 20:18:13 +0530 Subject: [PATCH 1/5] replace ADAM with Adam and its variants thereof --- docs/src/models/recurrence.md | 2 +- docs/src/saving.md | 2 +- docs/src/training/optimisers.md | 18 +++--- src/Flux.jl | 6 +- src/optimise/Optimise.jl | 4 +- src/optimise/optimisers.jl | 110 ++++++++++++++++---------------- test/optimise.jl | 10 +-- 7 files changed, 76 insertions(+), 76 deletions(-) diff --git a/docs/src/models/recurrence.md b/docs/src/models/recurrence.md index 3aff38597c..35da5697ae 100644 --- a/docs/src/models/recurrence.md +++ b/docs/src/models/recurrence.md @@ -173,7 +173,7 @@ Flux.reset!(m) [m(x) for x in seq_init] ps = Flux.params(m) -opt= ADAM(1e-3) +opt= Adam(1e-3) Flux.train!(loss, ps, data, opt) ``` diff --git a/docs/src/saving.md b/docs/src/saving.md index 2ec6d94372..80332d4a1d 100644 --- a/docs/src/saving.md +++ b/docs/src/saving.md @@ -135,6 +135,6 @@ You can store the optimiser state alongside the model, to resume training exactly where you left off. BSON is smart enough to [cache values](https://github.com/JuliaIO/BSON.jl/blob/v0.3.4/src/write.jl#L71) and insert links when saving, but only if it knows everything to be saved up front. Thus models and optimizers must be saved together to have the latter work after restoring. ```julia -opt = ADAM() +opt = Adam() @save "model-$(now()).bson" model opt ``` diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index e1fd1e9894..9455047836 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -39,7 +39,7 @@ for p in (W, b) end ``` -An optimiser `update!` accepts a parameter and a gradient, and updates the parameter according to the chosen rule. We can also pass `opt` to our [training loop](training.md), which will update all parameters of the model in a loop. However, we can now easily replace `Descent` with a more advanced optimiser such as `ADAM`. +An optimiser `update!` accepts a parameter and a gradient, and updates the parameter according to the chosen rule. We can also pass `opt` to our [training loop](training.md), which will update all parameters of the model in a loop. However, we can now easily replace `Descent` with a more advanced optimiser such as `Adam`. ## Optimiser Reference @@ -51,15 +51,15 @@ Descent Momentum Nesterov RMSProp -ADAM -RADAM +Adam +RAdam AdaMax -ADAGrad -ADADelta +AdaGrad +AdaDelta AMSGrad -NADAM -ADAMW -OADAM +NAdam +AdamW +OAdam AdaBelief ``` @@ -182,7 +182,7 @@ WeightDecay Gradient clipping is useful for training recurrent neural networks, which have a tendency to suffer from the exploding gradient problem. 
An example usage is ```julia -opt = Optimiser(ClipValue(1e-3), ADAM(1e-3)) +opt = Optimiser(ClipValue(1e-3), Adam(1e-3)) ``` ```@docs diff --git a/src/Flux.jl b/src/Flux.jl index a5eaec7ce4..0cacbd419a 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -29,9 +29,9 @@ include("optimise/Optimise.jl") using .Optimise using .Optimise: @epochs using .Optimise: skip -export Descent, ADAM, Momentum, Nesterov, RMSProp, - ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, OADAM, - ADAMW, RADAM, AdaBelief, InvDecay, ExpDecay, +export Descent, Adam, Momentum, Nesterov, RMSProp, + AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, + AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, WeightDecay, ClipValue, ClipNorm using CUDA diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index 010cbfc9bb..e691ce0170 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -4,8 +4,8 @@ using LinearAlgebra import ArrayInterface export train!, update!, - Descent, ADAM, Momentum, Nesterov, RMSProp, - ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW,RADAM, OADAM, AdaBelief, + Descent, Adam, Momentum, Nesterov, RMSProp, + AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW,RAdam, OAdam, AdaBelief, InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, ClipValue, ClipNorm diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index ce78586fff..a50e99eb35 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -147,9 +147,9 @@ function apply!(o::RMSProp, x, Δ) end """ - ADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) + Adam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) -[ADAM](https://arxiv.org/abs/1412.6980) optimiser. +[Adam](https://arxiv.org/abs/1412.6980) optimiser. # Parameters - Learning rate (`η`): Amount by which gradients are discounted before updating @@ -159,21 +159,21 @@ end # Examples ```julia -opt = ADAM() +opt = Adam() -opt = ADAM(0.001, (0.9, 0.8)) +opt = Adam(0.001, (0.9, 0.8)) ``` """ -mutable struct ADAM <: AbstractOptimiser +mutable struct Adam <: AbstractOptimiser eta::Float64 beta::Tuple{Float64,Float64} epsilon::Float64 state::IdDict{Any, Any} end -ADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = ADAM(η, β, ϵ, IdDict()) -ADAM(η::Real, β::Tuple, state::IdDict) = ADAM(η, β, EPS, state) +Adam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = Adam(η, β, ϵ, IdDict()) +Adam(η::Real, β::Tuple, state::IdDict) = Adam(η, β, EPS, state) -function apply!(o::ADAM, x, Δ) +function apply!(o::Adam, x, Δ) η, β = o.eta, o.beta mt, vt, βp = get!(o.state, x) do @@ -189,9 +189,9 @@ function apply!(o::ADAM, x, Δ) end """ - RADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) + RAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) -[Rectified ADAM](https://arxiv.org/abs/1908.03265) optimizer. +[Rectified Adam](https://arxiv.org/abs/1908.03265) optimizer. 
# Parameters - Learning rate (`η`): Amount by which gradients are discounted before updating @@ -201,21 +201,21 @@ end # Examples ```julia -opt = RADAM() +opt = RAdam() -opt = RADAM(0.001, (0.9, 0.8)) +opt = RAdam(0.001, (0.9, 0.8)) ``` """ -mutable struct RADAM <: AbstractOptimiser +mutable struct RAdam <: AbstractOptimiser eta::Float64 beta::Tuple{Float64,Float64} epsilon::Float64 state::IdDict{Any, Any} end -RADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RADAM(η, β, ϵ, IdDict()) -RADAM(η::Real, β::Tuple, state::IdDict) = RADAM(η, β, EPS, state) +RAdam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RAdam(η, β, ϵ, IdDict()) +RAdam(η::Real, β::Tuple, state::IdDict) = RAdam(η, β, EPS, state) -function apply!(o::RADAM, x, Δ) +function apply!(o::RAdam, x, Δ) η, β = o.eta, o.beta ρ∞ = 2/(1-β[2])-1 @@ -241,7 +241,7 @@ end """ AdaMax(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) -[AdaMax](https://arxiv.org/abs/1412.6980) is a variant of ADAM based on the ∞-norm. +[AdaMax](https://arxiv.org/abs/1412.6980) is a variant of Adam based on the ∞-norm. # Parameters - Learning rate (`η`): Amount by which gradients are discounted before updating @@ -281,10 +281,10 @@ function apply!(o::AdaMax, x, Δ) end """ - OADAM(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ = $EPS) + OAdam(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ = $EPS) -[OADAM](https://arxiv.org/abs/1711.00141) (Optimistic ADAM) -is a variant of ADAM adding an "optimistic" term suitable for adversarial training. +[OAdam](https://arxiv.org/abs/1711.00141) (Optimistic Adam) +is a variant of Adam adding an "optimistic" term suitable for adversarial training. # Parameters - Learning rate (`η`): Amount by which gradients are discounted before updating @@ -294,21 +294,21 @@ is a variant of ADAM adding an "optimistic" term suitable for adversarial traini # Examples ```julia -opt = OADAM() +opt = OAdam() -opt = OADAM(0.001, (0.9, 0.995)) +opt = OAdam(0.001, (0.9, 0.995)) ``` """ -mutable struct OADAM <: AbstractOptimiser +mutable struct OAdam <: AbstractOptimiser eta::Float64 beta::Tuple{Float64,Float64} epsilon::Float64 state::IdDict{Any, Any} end -OADAM(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OADAM(η, β, ϵ, IdDict()) -OADAM(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) +OAdam(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OAdam(η, β, ϵ, IdDict()) +OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) -function apply!(o::OADAM, x, Δ) +function apply!(o::OAdam, x, Δ) η, β = o.eta, o.beta mt, vt, Δ_, βp = get!(o.state, x) do @@ -326,9 +326,9 @@ function apply!(o::OADAM, x, Δ) end """ - ADAGrad(η = 0.1, ϵ = $EPS) + AdaGrad(η = 0.1, ϵ = $EPS) -[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has +[AdaGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has parameter specific learning rates based on how frequently it is updated. Parameters don't need tuning. @@ -338,20 +338,20 @@ Parameters don't need tuning. 
# Examples ```julia -opt = ADAGrad() +opt = AdaGrad() -opt = ADAGrad(0.001) +opt = AdaGrad(0.001) ``` """ -mutable struct ADAGrad <: AbstractOptimiser +mutable struct AdaGrad <: AbstractOptimiser eta::Float64 epsilon::Float64 acc::IdDict end -ADAGrad(η::Real = 0.1, ϵ::Real = EPS) = ADAGrad(η, ϵ, IdDict()) -ADAGrad(η::Real, state::IdDict) = ADAGrad(η, EPS, state) +AdaGrad(η::Real = 0.1, ϵ::Real = EPS) = AdaGrad(η, ϵ, IdDict()) +AdaGrad(η::Real, state::IdDict) = AdaGrad(η, EPS, state) -function apply!(o::ADAGrad, x, Δ) +function apply!(o::AdaGrad, x, Δ) η = o.eta acc = get!(() -> fill!(similar(x), o.epsilon), o.acc, x)::typeof(x) @. acc += Δ * conj(Δ) @@ -361,7 +361,7 @@ end """ ADADelta(ρ = 0.9, ϵ = $EPS) -[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning +[ADADelta](https://arxiv.org/abs/1212.5701) is a version of AdaGrad adapting its learning rate based on a window of past gradient updates. Parameters don't need tuning. @@ -397,7 +397,7 @@ end """ AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) -The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM +The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the Adam optimiser. Parameters don't need tuning. # Parameters @@ -436,9 +436,9 @@ function apply!(o::AMSGrad, x, Δ) end """ - NADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) + NAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) -[NADAM](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of ADAM. +[NAdam](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of Adam. Parameters don't need tuning. # Parameters @@ -449,21 +449,21 @@ Parameters don't need tuning. # Examples ```julia -opt = NADAM() +opt = NAdam() -opt = NADAM(0.002, (0.89, 0.995)) +opt = NAdam(0.002, (0.89, 0.995)) ``` """ -mutable struct NADAM <: AbstractOptimiser +mutable struct NAdam <: AbstractOptimiser eta::Float64 beta::Tuple{Float64, Float64} epsilon::Float64 state::IdDict{Any, Any} end -NADAM(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NADAM(η, β, ϵ, IdDict()) -NADAM(η::Real, β::Tuple, state::IdDict) = NADAM(η, β, EPS, state) +NAdam(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NAdam(η, β, ϵ, IdDict()) +NAdam(η::Real, β::Tuple, state::IdDict) = NAdam(η, β, EPS, state) -function apply!(o::NADAM, x, Δ) +function apply!(o::NAdam, x, Δ) η, β = o.eta, o.beta mt, vt, βp = get!(o.state, x) do @@ -480,9 +480,9 @@ function apply!(o::NADAM, x, Δ) end """ - ADAMW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0) + AdamW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0) -[ADAMW](https://arxiv.org/abs/1711.05101) is a variant of ADAM fixing (as in repairing) its +[AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its weight decay regularization. # Parameters @@ -494,19 +494,19 @@ weight decay regularization. # Examples ```julia -opt = ADAMW() +opt = AdamW() -opt = ADAMW(0.001, (0.89, 0.995), 0.1) +opt = AdamW(0.001, (0.89, 0.995), 0.1) ``` """ -ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = - Optimiser(ADAM(η, β), WeightDecay(decay)) +AdamW(η = 0.001, β = (0.9, 0.999), decay = 0) = + Optimiser(Adam(η, β), WeightDecay(decay)) """ AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known -ADAM optimiser. +Adam optimiser. 
# Parameters - Learning rate (`η`): Amount by which gradients are discounted before updating @@ -537,7 +537,7 @@ function apply!(o::AdaBelief, x, Δ) (zero(x), zero(x), Float64[β[1], β[2]]) end :: Tuple{typeof(x), typeof(x), Vector{Float64}} - #= st is a variance and can go to zero. This is in contrast to ADAM, which uses the + #= st is a variance and can go to zero. This is in contrast to Adam, which uses the second moment which is usually far enough from zero. This is problematic, since st can be slightly negative due to numerical error, and the square root below will fail. Also, if we want to differentiate through the optimizer, √0 is not differentiable. @@ -643,10 +643,10 @@ for more general scheduling techniques. `ExpDecay` is typically composed with other optimizers as the last transformation of the gradient: ```julia -opt = Optimiser(ADAM(), ExpDecay(1.0)) +opt = Optimiser(Adam(), ExpDecay(1.0)) ``` Note: you may want to start with `η=1` in `ExpDecay` when combined with other -optimizers (`ADAM` in this case) that have their own learning rate. +optimizers (`Adam` in this case) that have their own learning rate. """ mutable struct ExpDecay <: AbstractOptimiser eta::Float64 @@ -681,7 +681,7 @@ with coefficient ``λ`` to the loss. # Examples ```julia -opt = Optimiser(WeightDecay(1f-4), ADAM()) +opt = Optimiser(WeightDecay(1f-4), Adam()) ``` """ mutable struct WeightDecay <: AbstractOptimiser diff --git a/test/optimise.jl b/test/optimise.jl index 9c358a6825..e922d3c0b8 100644 --- a/test/optimise.jl +++ b/test/optimise.jl @@ -10,8 +10,8 @@ using Random # so that w and w' are different Random.seed!(84) w = randn(10, 10) - @testset for opt in [ADAMW(), ADAGrad(0.1), AdaMax(), ADADelta(0.9), AMSGrad(), - NADAM(), RADAM(), Descent(0.1), ADAM(), OADAM(), AdaBelief(), + @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), + NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), Nesterov(), RMSProp(), Momentum()] Random.seed!(42) w′ = randn(10, 10) @@ -34,7 +34,7 @@ end Random.seed!(42) w′ = randn(10, 10) loss(x) = Flux.Losses.mse(w*x, w′*x) - opt = Optimiser(Opt(), ADAM(0.001)) + opt = Optimiser(Opt(), Adam(0.001)) for t = 1:10^5 θ = Params([w′]) x = rand(10) @@ -202,7 +202,7 @@ end end # Flux PR #1776 -# We need to test that optimisers like ADAM that maintain an internal momentum +# We need to test that optimisers like Adam that maintain an internal momentum # estimate properly calculate the second-order statistics on the gradients as # the flow backward through the model. Previously, we would calculate second- # order statistics via `Δ^2` rather than the complex-aware `Δ * conj(Δ)`, which @@ -210,7 +210,7 @@ end # a simple optimization is montonically decreasing (up to learning step effects) @testset "Momentum Optimisers and complex values" begin # Test every optimizer that has momentum internally - for opt_ctor in [ADAM, RMSProp, RADAM, OADAM, ADAGrad, ADADelta, NADAM, AdaBelief] + for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] # Our "model" is just a complex number w = zeros(ComplexF32, 1) From 6b935bd92dcf64c47b2c6cf5200cf3f2fa49345f Mon Sep 17 00:00:00 2001 From: karthikdk Date: Wed, 15 Jun 2022 16:05:06 +0530 Subject: [PATCH 2/5] add deprecations --- src/deprecations.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/deprecations.jl b/src/deprecations.jl index eb1f2fdcda..a787baf678 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -71,3 +71,12 @@ LSTMCell(in::Integer, out::Integer; kw...) 
= LSTMCell(in => out; kw...) GRUCell(in::Integer, out::Integer; kw...) = GRUCell(in => out; kw...) GRUv3Cell(in::Integer, out::Integer; kw...) = GRUv3Cell(in => out; kw...) + +# Deprecate Optimisers with old naming convention +@deprecate ADAM Adam +@deprecate NADAM NAdam +@deprecate ADAMW AdamW +@deprecate RADAM RAdam +@deprecate OADAM OAdam +@deprecate ADAGrad AdaGrad +@deprecate ADADelta AdaDelta From 65add3f640104fe9f018ac410ecce4cbbfd763a1 Mon Sep 17 00:00:00 2001 From: karthikdk Date: Wed, 15 Jun 2022 19:24:04 +0530 Subject: [PATCH 3/5] change ADADelta to AdaDelta --- src/optimise/optimisers.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index a50e99eb35..ce72a4b0ce 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -359,9 +359,9 @@ function apply!(o::AdaGrad, x, Δ) end """ - ADADelta(ρ = 0.9, ϵ = $EPS) + AdaDelta(ρ = 0.9, ϵ = $EPS) -[ADADelta](https://arxiv.org/abs/1212.5701) is a version of AdaGrad adapting its learning +[AdaDelta](https://arxiv.org/abs/1212.5701) is a version of AdaGrad adapting its learning rate based on a window of past gradient updates. Parameters don't need tuning. @@ -370,20 +370,20 @@ Parameters don't need tuning. # Examples ```julia -opt = ADADelta() +opt = AdaDelta() -opt = ADADelta(0.89) +opt = AdaDelta(0.89) ``` """ -mutable struct ADADelta <: AbstractOptimiser +mutable struct AdaDelta <: AbstractOptimiser rho::Float64 epsilon::Float64 state::IdDict{Any, Any} end -ADADelta(ρ::Real = 0.9, ϵ::Real = EPS) = ADADelta(ρ, ϵ, IdDict()) -ADADelta(ρ::Real, state::IdDict) = ADADelta(ρ, EPS, state) +AdaDelta(ρ::Real = 0.9, ϵ::Real = EPS) = AdaDelta(ρ, ϵ, IdDict()) +AdaDelta(ρ::Real, state::IdDict) = AdaDelta(ρ, EPS, state) -function apply!(o::ADADelta, x, Δ) +function apply!(o::AdaDelta, x, Δ) ρ = o.rho acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)} @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) From 31a9950d312eaffb939d144e7ba4a78eb5208c0b Mon Sep 17 00:00:00 2001 From: Brian Chen Date: Wed, 15 Jun 2022 10:49:46 -0700 Subject: [PATCH 4/5] Tweak deprecation comment --- src/deprecations.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deprecations.jl b/src/deprecations.jl index a787baf678..9e99af0271 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -72,7 +72,7 @@ LSTMCell(in::Integer, out::Integer; kw...) = LSTMCell(in => out; kw...) GRUCell(in::Integer, out::Integer; kw...) = GRUCell(in => out; kw...) GRUv3Cell(in::Integer, out::Integer; kw...) = GRUv3Cell(in => out; kw...) -# Deprecate Optimisers with old naming convention +# Optimisers with old naming convention @deprecate ADAM Adam @deprecate NADAM NAdam @deprecate ADAMW AdamW From 764014903a14e659a91d6ac854f5aa4b3c2f0500 Mon Sep 17 00:00:00 2001 From: Brian Chen Date: Wed, 15 Jun 2022 18:43:14 -0700 Subject: [PATCH 5/5] Use `@deprecate_binding` instead This should preserve the type as a type and fix downstream dispatches. --- src/deprecations.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/deprecations.jl b/src/deprecations.jl index 9e99af0271..6719bd39e2 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -73,10 +73,10 @@ GRUCell(in::Integer, out::Integer; kw...) = GRUCell(in => out; kw...) GRUv3Cell(in::Integer, out::Integer; kw...) = GRUv3Cell(in => out; kw...) 
# Optimisers with old naming convention -@deprecate ADAM Adam -@deprecate NADAM NAdam -@deprecate ADAMW AdamW -@deprecate RADAM RAdam -@deprecate OADAM OAdam -@deprecate ADAGrad AdaGrad -@deprecate ADADelta AdaDelta +Base.@deprecate_binding ADAM Adam +Base.@deprecate_binding NADAM NAdam +Base.@deprecate_binding ADAMW AdamW +Base.@deprecate_binding RADAM RAdam +Base.@deprecate_binding OADAM OAdam +Base.@deprecate_binding ADAGrad AdaGrad +Base.@deprecate_binding ADADelta AdaDelta
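
A quick illustration of why `Base.@deprecate_binding` is preferable here: it keeps the old name bound to the same type object, so downstream code that constructs or dispatches on the old name keeps working while emitting a deprecation warning (when depwarns are enabled). The sketch below is not part of the patch series; it only assumes the patches above are applied, and the `lr` helper is hypothetical, for illustration.

```julia
using Flux

# Constructing through the old binding still works; `ADAM` now resolves
# to the `Adam` type, with a deprecation warning under `--depwarn=yes`.
opt = ADAM(1e-3)
opt isa Adam              # true — same type, only the canonical name changed

# Downstream dispatches on the old name also keep working, because the
# deprecated binding points at the very same type. (With plain `@deprecate`,
# `ADAM` would have become a function rather than a type, breaking method
# signatures such as the one below.)
lr(o::ADAM) = o.eta       # hypothetical helper, not part of Flux
lr(opt)                   # 0.001
```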