From d42a408a8367265be96e4dbffb660a6dc5968cb6 Mon Sep 17 00:00:00 2001 From: Penelope Yong Date: Tue, 26 Nov 2024 18:10:53 +0000 Subject: [PATCH] Update to Turing v0.35 (#559) * Add readme note on Julia version * Bump Turing to 0.35 * Update minimum supported Julia version * Remove unnecessary version qualifier * Remove Tracker and replace with Mooncake, except in BNN doc * Use Mooncake in BNN doc (#521) * Fix BNN doc to work with Mooncake --- Manifest.toml | 39 ++++++++++++++++++- Project.toml | 2 +- README.md | 17 ++++++++ .../03-bayesian-neural-network/index.qmd | 6 +-- .../index.qmd | 2 +- tutorials/docs-00-getting-started/index.qmd | 2 +- .../docs-10-using-turing-autodiff/index.qmd | 8 ++-- .../docs-12-using-turing-guide/index.qmd | 4 +- .../index.qmd | 11 +++--- 9 files changed, 73 insertions(+), 18 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 83ba4bd2d..ec0434727 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.10.6" manifest_format = "2.0" -project_hash = "52677c2802b9702303d49a2f40fdefa34939b429" +project_hash = "53e97ef537fe2e98eecbe367cd36c4bc6a201e3e" [[deps.ADTypes]] git-tree-sha1 = "72af59f5b8f09faee36b4ec48e014a79210f2f4f" @@ -708,6 +708,12 @@ git-tree-sha1 = "23163d55f885173722d1e4cf0f6110cdbaf7e272" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" version = "1.15.1" +[[deps.DiffTests]] +deps = ["LinearAlgebra", "SparseArrays", "Statistics"] +git-tree-sha1 = "b92beb1933df01bf4915d3a05e54c2a0aad312c7" +uuid = "de460e47-3fe3-5279-bb4a-814414816d5d" +version = "0.1.2" + [[deps.DifferentialEquations]] deps = ["BoundaryValueDiffEq", "DelayDiffEq", "DiffEqBase", "DiffEqCallbacks", "DiffEqNoiseProcess", "JumpProcesses", "LinearAlgebra", "LinearSolve", "NonlinearSolve", "OrdinaryDiffEq", "Random", "RecursiveArrayTools", "Reexport", "SciMLBase", "SteadyStateDiffEq", "StochasticDiffEq", "Sundials"] git-tree-sha1 = "d55af9d6b51c54f81ae30d1a463206d32cc4c24a" @@ -2109,6 +2115,11 @@ git-tree-sha1 = "ec4f7fbeab05d7747bdf98eb74d130a2a2ed298d" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" version = "1.2.0" +[[deps.MistyClosures]] +git-tree-sha1 = "1142aefd845c608f3c70e4c202c4aae725cab67b" +uuid = "dbe65cb8-6be2-42dd-bbc5-4196aaced4f4" +version = "2.0.0" + [[deps.Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" @@ -2118,6 +2129,32 @@ git-tree-sha1 = "2c140d60d7cb82badf06d8783800d0bcd1a7daa2" uuid = "78c3b35d-d492-501b-9361-3d52fe80e533" version = "0.8.1" +[[deps.Mooncake]] +deps = ["ADTypes", "ChainRules", "ChainRulesCore", "DiffRules", "DiffTests", "ExprTools", "FunctionWrappers", "Graphs", "InteractiveUtils", "LinearAlgebra", "MistyClosures", "Random", "Setfield", "Test"] +git-tree-sha1 = "9fa3c8c11fe5286badb9c24e8d2420b3a07f9a8c" +uuid = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" +version = "0.4.50" + + [deps.Mooncake.extensions] + MooncakeAllocCheckExt = "AllocCheck" + MooncakeCUDAExt = "CUDA" + MooncakeDynamicPPLExt = "DynamicPPL" + MooncakeJETExt = "JET" + MooncakeLuxLibExt = "LuxLib" + MooncakeLuxLibSLEEFPiratesExtension = ["LuxLib", "SLEEFPirates"] + MooncakeNNlibExt = "NNlib" + MooncakeSpecialFunctionsExt = "SpecialFunctions" + + [deps.Mooncake.weakdeps] + AllocCheck = "9b6a8646-10ed-4001-bbdc-1d2f46dfbb1a" + CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" + JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" + LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11" + NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" + SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa" + SpecialFunctions = 
"276daf66-3868-5448-9aa4-cd146d93841b" + [[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" version = "2023.1.10" diff --git a/Project.toml b/Project.toml index b14761bea..7b850e84b 100644 --- a/Project.toml +++ b/Project.toml @@ -33,6 +33,7 @@ MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" Measures = "442fdcdd-2543-5da2-b0f3-8c86c306513e" Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73" MicroCanonicalHMC = "234d2aa0-2291-45f7-9047-6fa6f316b0a8" +Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" OptimizationNLopt = "4e6fcdb7-1186-4e1f-a706-475e75c168bb" @@ -48,7 +49,6 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0" TuringBenchmarking = "0db1332d-5c25-4deb-809f-459bc696f94f" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" diff --git a/README.md b/README.md index bf7a0078c..a19b64e15 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,17 @@ This repository is part of [Turing.jl's](https://turinglang.org/) website (i.e. - The `master` branch contains the quarto source - The `gh-pages` branch contains the `html` version of these documents compiled from the `master` branch. +> [!NOTE] +> Due to [an incompatibility between Bijectors.jl and +> Enzyme.jl](https://github.com/TuringLang/Bijectors.jl/pull/341), the docs +> currently must be built with Julia 1.10. You can do this either by making +> Julia 1.10 your default Julia version (`juliaup default 1.10`), or by +> explicitly specifying Julia 1.10 when running Quarto: +> +> ```bash +> QUARTO_JULIA=$(julia +1.10 -e "println(Sys.BINDIR)")/julia quarto render +> ``` + ## Local development To get started with the docs website locally, you'll need to have [Quarto](https://quarto.org/docs/download/) installed. @@ -79,6 +90,12 @@ If you find that Quarto's execution is failing with errors that aren't reproduci quarto render /path/to/index.qmd --execute-daemon-restart ``` +And also, kill any stray Quarto processes that are still running (sometimes it keeps running in the background): + +```bash +pkill -9 -f quarto +``` + ## License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. diff --git a/tutorials/03-bayesian-neural-network/index.qmd b/tutorials/03-bayesian-neural-network/index.qmd index fcfa7502f..96a8816d1 100755 --- a/tutorials/03-bayesian-neural-network/index.qmd +++ b/tutorials/03-bayesian-neural-network/index.qmd @@ -19,7 +19,7 @@ using Turing using FillArrays using Lux using Plots -using Tracker +import Mooncake using Functors using LinearAlgebra @@ -189,7 +189,7 @@ const nn = StatefulLuxLayer{true}(nn_initial, nothing, st) parameters ~ MvNormal(zeros(nparameters), Diagonal(abs2.(sigma .* ones(nparameters)))) # Forward NN to make predictions - preds = Lux.apply(nn, xs, vector_to_parameters(parameters, ps)) + preds = Lux.apply(nn, xs, f32(vector_to_parameters(parameters, ps))) # Observe each prediction. for i in eachindex(ts) @@ -208,7 +208,7 @@ setprogress!(false) ```{julia} # Perform inference. 
 N = 2_000
-ch = sample(bayes_nn(reduce(hcat, xs), ts), NUTS(; adtype=AutoTracker()), N);
+ch = sample(bayes_nn(reduce(hcat, xs), ts), NUTS(; adtype=AutoMooncake(; config=nothing)), N);
 ```
 
 Now we extract the parameter samples from the sampled chain as `θ` (this is of size `2000 x 20` where `2000` is the number of iterations and `20` is the number of parameters).
diff --git a/tutorials/10-bayesian-differential-equations/index.qmd b/tutorials/10-bayesian-differential-equations/index.qmd
index 9070ea5a3..b7ff4ae0f 100755
--- a/tutorials/10-bayesian-differential-equations/index.qmd
+++ b/tutorials/10-bayesian-differential-equations/index.qmd
@@ -320,7 +320,7 @@ More theoretical details on these methods can be found at: https://docs.sciml.ai
 
 While these sensitivity analysis methods may seem complicated, using them is dead simple.
 Here is a version of the Lotka-Volterra model using adjoint sensitivities.
-All we have to do is switch the AD backend to one of the adjoint-compatible backends (ReverseDiff, Tracker, or Zygote)!
+All we have to do is switch the AD backend to one of the adjoint-compatible backends (ReverseDiff or Zygote)!
 Notice that on this model adjoints are slower. This is because adjoints have a higher overhead on small parameter models and therefore we suggest using these methods only for models with around 100 parameters or more. For more details, see https://arxiv.org/abs/1812.01892.
diff --git a/tutorials/docs-00-getting-started/index.qmd b/tutorials/docs-00-getting-started/index.qmd
index a6729ebd7..eb97dede3 100644
--- a/tutorials/docs-00-getting-started/index.qmd
+++ b/tutorials/docs-00-getting-started/index.qmd
@@ -16,7 +16,7 @@ Pkg.instantiate();
 
 To use Turing, you need to install Julia first and then install Turing.
 
-You will need to install Julia 1.7 or greater, which you can get from [the official Julia website](http://julialang.org/downloads/).
+You will need to install Julia 1.10 or greater, which you can get from [the official Julia website](http://julialang.org/downloads/).
 
 Turing is officially registered in the [Julia General package registry](https://github.com/JuliaRegistries/General), which means that you can install a stable version of Turing by running the following in the Julia REPL:
diff --git a/tutorials/docs-10-using-turing-autodiff/index.qmd b/tutorials/docs-10-using-turing-autodiff/index.qmd
index d5753447e..48541ab5a 100755
--- a/tutorials/docs-10-using-turing-autodiff/index.qmd
+++ b/tutorials/docs-10-using-turing-autodiff/index.qmd
@@ -12,9 +12,8 @@ Pkg.instantiate();
 
 ## Switching AD Modes
 
-Turing currently supports four automatic differentiation (AD) backends for sampling: [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) for forward-mode AD; and [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl), [Zygote](https://github.com/FluxML/Zygote.jl), and [Tracker](https://github.com/FluxML/Tracker.jl) for reverse-mode AD.
-While `Tracker` is still available, its use is discouraged due to a lack of active maintenance.
-`ForwardDiff` is automatically imported by Turing. To utilize `Zygote` or `ReverseDiff` for AD, users must explicitly import them with `using Zygote` or `using ReverseDiff`, alongside `using Turing`.
+Turing currently supports four automatic differentiation (AD) backends for sampling: [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) for forward-mode AD; and [Mooncake](https://github.com/compintell/Mooncake.jl), [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl), and [Zygote](https://github.com/FluxML/Zygote.jl) for reverse-mode AD.
+`ForwardDiff` is automatically imported by Turing. To utilize `Mooncake`, `Zygote`, or `ReverseDiff` for AD, users must explicitly import them with `import Mooncake`, `import Zygote`, or `import ReverseDiff`, alongside `using Turing`.
 
 As of Turing version v0.30, the global configuration flag for the AD backend has been removed in favour of [`ADTypes.jl`](https://github.com/SciML/ADTypes.jl), allowing users to specify the AD backend for individual samplers independently.
 Users can pass the `adtype` keyword argument to the sampler constructor to select the desired AD backend, with the default being `AutoForwardDiff(; chunksize=0)`.
@@ -69,7 +68,8 @@ Generally, reverse-mode AD, for instance `ReverseDiff`, is faster when sampling
 
 If the differentiation method is not specified in this way, Turing will default to using whatever the global AD backend is.
 Currently, this defaults to `ForwardDiff`.
 
-The most reliable way to ensure you are using the fastest AD that works for your problem is to benchmark them using `TuringBenchmarking`:
+The most reliable way to ensure you are using the fastest AD backend that works for your problem is to benchmark the candidate backends using [`TuringBenchmarking`](https://github.com/TuringLang/TuringBenchmarking.jl):
+
 ```{julia}
 using TuringBenchmarking
 benchmark_model(gdemo(1.5, 2), adbackends=[AutoForwardDiff(), AutoReverseDiff()])
diff --git a/tutorials/docs-12-using-turing-guide/index.qmd b/tutorials/docs-12-using-turing-guide/index.qmd
index b6f16ac81..565440296 100755
--- a/tutorials/docs-12-using-turing-guide/index.qmd
+++ b/tutorials/docs-12-using-turing-guide/index.qmd
@@ -166,7 +166,7 @@ The `chains` variable now contains a `Chains` object which can be indexed by cha
 
 #### Multithreaded sampling
 
-If you wish to perform multithreaded sampling and are running Julia 1.3 or greater, you can call `sample` with the following signature:
+If you wish to perform multithreaded sampling, you can call `sample` with the following signature:
 
 ```{julia}
 #| eval: false
@@ -514,7 +514,7 @@ ForwardDiff (Turing's default AD backend) uses forward-mode chunk-wise AD. The c
 
 #### AD Backend
 
-Turing supports four automatic differentiation (AD) packages in the back end during sampling. The default AD backend is [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) for forward-mode AD. Three reverse-mode AD backends are also supported, namely [Tracker](https://github.com/FluxML/Tracker.jl), [Zygote](https://github.com/FluxML/Zygote.jl) and [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl). `Zygote` and `ReverseDiff` are supported optionally if explicitly loaded by the user with `using Zygote` or `using ReverseDiff` next to `using Turing`.
+Turing supports four automatic differentiation (AD) packages in the back end during sampling. The default AD backend is [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) for forward-mode AD. Three reverse-mode AD backends are also supported, namely [Mooncake](https://github.com/compintell/Mooncake.jl), [Zygote](https://github.com/FluxML/Zygote.jl), and [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl).
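+As a minimal sketch of selecting one of these backends (the `demo` model here is illustrative, not from this guide; it assumes Turing v0.35 and Mooncake are installed):
+
+```{julia}
+#| eval: false
+using Turing
+import Mooncake
+
+# A toy model; NUTS differentiates its log joint with the chosen AD backend.
+@model function demo(x)
+    m ~ Normal(0, 1)
+    x ~ Normal(m, 1)
+end
+
+# The AD backend is chosen per-sampler via the `adtype` keyword argument.
+chain = sample(demo(1.5), NUTS(; adtype=AutoMooncake(; config=nothing)), 1000)
+```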
`Mooncake`, `Zygote`, and `ReverseDiff` also require the user to explicitly load them using `import Mooncake`, `import Zygote`, or `import ReverseDiff` next to `using Turing`.
 
 For more information on Turing's automatic differentiation backend, please see the [Automatic Differentiation]({{}}) article.
diff --git a/tutorials/docs-13-using-turing-performance-tips/index.qmd b/tutorials/docs-13-using-turing-performance-tips/index.qmd
index 9dd73d752..4eb9ba42b 100755
--- a/tutorials/docs-13-using-turing-performance-tips/index.qmd
+++ b/tutorials/docs-13-using-turing-performance-tips/index.qmd
@@ -43,9 +43,10 @@ end
 ## Choose your AD backend
 Automatic differentiation (AD) makes it possible to use modern, efficient gradient-based samplers like NUTS and HMC, and that means a good AD system is incredibly important. Turing currently
-supports several AD backends, including [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) (the default), [Zygote](https://github.com/FluxML/Zygote.jl),
-[ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl), and [Tracker](https://github.com/FluxML/Tracker.jl). Experimental support is also available for
-[Tapir](https://github.com/withbayes/Tapir.jl).
+supports several AD backends, including [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) (the default),
+[Mooncake](https://github.com/compintell/Mooncake.jl),
+[Zygote](https://github.com/FluxML/Zygote.jl), and
+[ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl).
 For many common types of models, the default ForwardDiff backend performs great, and there is no need to worry about changing it. However, if you need more speed, you can try
 different backends via the standard [ADTypes](https://github.com/SciML/ADTypes.jl) interface by passing an `AbstractADType` to the sampler with the optional `adtype` argument, e.g.
 few parameters (say, less than 20 or so), while reverse-mode backends such as `AutoZygote()` or `AutoReverseDiff()` will perform better for models with many parameters or linear algebra
 operations. If in doubt, it's easy to try a few different backends to see how they compare.
 
-### Special care for Zygote and Tracker
+### Special care for Zygote
 
-Note that Zygote and Tracker will not perform well if your model contains `for`-loops, due to the way reverse-mode AD is implemented in these packages. Zygote also cannot differentiate code
+Note that Zygote will not perform well if your model contains `for`-loops, due to the way reverse-mode AD is implemented in this package. Zygote also cannot differentiate code
 that contains mutating operations. If you can't implement your model without `for`-loops or mutation, `ReverseDiff` will be a better, more performant option. In general, though, vectorized operations are still likely to perform best.
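+
+As a minimal sketch of that advice (the models below are illustrative, not from the docs), the two formulations are statistically equivalent, but the vectorized one avoids the `for`-loop that slows Zygote down:
+
+```{julia}
+#| eval: false
+using Turing
+using LinearAlgebra: I
+
+# Loop version: fine under ForwardDiff or ReverseDiff, slow under Zygote.
+@model function loopy(x)
+    m ~ Normal(0, 1)
+    for i in eachindex(x)
+        x[i] ~ Normal(m, 1)
+    end
+end
+
+# Vectorized version: a single multivariate observation, no explicit loop.
+@model function vectorized(x)
+    m ~ Normal(0, 1)
+    x ~ MvNormal(fill(m, length(x)), I)
+end
+```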