diff --git a/.buildkite/distributed/pipeline.yml b/.buildkite/distributed/pipeline.yml
index f23b10c28d..0c06cbb02f 100644
--- a/.buildkite/distributed/pipeline.yml
+++ b/.buildkite/distributed/pipeline.yml
@@ -3,34 +3,26 @@ agents:
   slurm_mem: 8G
   modules: climacommon/2024_05_27
 
-
 env:
   JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite/distributed"
   OPENBLAS_NUM_THREADS: 1
+  JULIA_PKG_SERVER_REGISTRY_PREFERENCE: eager
+  JULIA_NUM_PRECOMPILE_TASKS: 8
+  JULIA_NUM_THREADS: 8  # NOTE(review): pipeline-wide, but the srun steps set no slurm_cpus_per_task - confirm
   OMPI_MCA_opal_warn_on_missing_libcuda: 0
 
 steps:
   - label: "initialize"
     key: "init_central"
+    env:
+      TEST_GROUP: "init"  # empty test group: Pkg.test() just executes test/runtests.jl to download artifacts
     command:
       - echo "--- Instantiate project"
-      - "julia --project -e 'using Pkg; Pkg.instantiate(;verbose=true); Pkg.precompile(;strict=true)'"
-      # force the initialization of the CUDA runtime as it is lazily loaded by default
-      - "julia --project -e 'using CUDA; CUDA.precompile_runtime(); CUDA.versioninfo()'"
-      - "julia --project -e 'using MPI; MPI.versioninfo()'"
-      # Download artifacts by running an empty testgroup and thereby executing /test/runtests.jl
       - "julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
-
-      - echo "--- Instantiate status"
-      - "julia --project -e 'using Pkg; Pkg.status()'"
-
     agents:
       slurm_mem: 120G
       slurm_gpus: 1
       slurm_cpus_per_task: 8
-    env:
-      JULIA_NUM_PRECOMPILE_TASKS: 8
-      TEST_GROUP: "init"
 
   - wait
 
@@ -54,6 +46,11 @@ steps:
       slurm_mem: 120G
       slurm_ntasks: 4
       slurm_gpus_per_task: 1
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
+
 
   - label: "🦾 cpu distributed solvers tests"
     key: "distributed_solvers_cpu"
@@ -65,6 +62,21 @@ steps:
       slurm_mem: 120G
       slurm_ntasks: 4
 
+  - label: "🛸 gpu distributed solvers tests"
+    key: "distributed_solvers_gpu"
+    env:
+      TEST_GROUP: "distributed_solvers"
+    commands:
+      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
+    agents:
+      slurm_mem: 120G
+      slurm_ntasks: 4
+      slurm_gpus_per_task: 1
+    retry:  # re-run once automatically if the job exits with status 1
+      automatic:
+        - exit_status: 1
+          limit: 1
+
   - label: "🤺 cpu distributed hydrostatic model tests"
     key: "distributed_hydrostatic_model_cpu"
     env:
@@ -75,7 +87,7 @@ steps:
       slurm_mem: 120G
       slurm_ntasks: 4
 
-  - label: "🕺 gpu distributed hydrostatic model tests"
+  - label: "🦏 gpu distributed hydrostatic model tests"
     key: "distributed_hydrostatic_model_gpu"
     env:
       TEST_GROUP: "distributed_hydrostatic_model"
@@ -85,7 +97,36 @@ steps:
       slurm_mem: 120G
       slurm_ntasks: 4
       slurm_gpus_per_task: 1
-    
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
+
+  - label: "🦍 cpu distributed nonhydrostatic regression"
+    key: "distributed_nonhydrostatic_regression_cpu"
+    env:
+      TEST_GROUP: "distributed_nonhydrostatic_regression"
+    commands:
+      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
+    agents:
+      slurm_mem: 120G
+      slurm_ntasks: 4  # CPU variant: no slurm_gpus_per_task is requested for this step
+
+  - label: "🕺 gpu distributed nonhydrostatic regression"
+    key: "distributed_nonhydrostatic_regression_gpu"
+    env:
+      TEST_GROUP: "distributed_nonhydrostatic_regression"
+    commands:
+      - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
+    agents:
+      slurm_mem: 120G
+      slurm_ntasks: 4
+      slurm_gpus_per_task: 1
+    retry:  # re-run once automatically if the job exits with status 1
+      automatic:
+        - exit_status: 1
+          limit: 1
+
   - wait
 
   - label: ":chart_with_downwards_trend: build history"
@@ -93,3 +134,4 @@ steps:
       - "build_history staging"
     artifact_paths:
       - "build_history.html"
+
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index f03245b46f..32ae563fae 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -1,9 +1,11 @@
 env:
-  JULIA_VERSION: "1.10.2"
+  JULIA_VERSION: "1.10.5"
   JULIA_MINOR_VERSION: "1.10"
   SVERDRUP_HOME: "/data5/glwagner"
   TARTARUS_HOME: "/storage5/buildkite-agent"
   JULIA_PKG_SERVER_REGISTRY_PREFERENCE: eager
+  JULIA_NUM_PRECOMPILE_TASKS: 8
+  JULIA_NUM_THREADS: 8
 
 steps:
   - label: "🎪 initialize gpu environment"
@@ -15,46 +17,35 @@ steps:
       # Download julia binaries
       - "wget -N -P $SVERDRUP_HOME https://julialang-s3.julialang.org/bin/linux/x64/$JULIA_MINOR_VERSION/julia-$JULIA_VERSION-linux-x86_64.tar.gz"
       - "tar xf $SVERDRUP_HOME/julia-$JULIA_VERSION-linux-x86_64.tar.gz -C $SVERDRUP_HOME"
-
-      # Instantiate and precompile
-      - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.instantiate(; verbose=true)'"
-      - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.build()'"
-      - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.precompile()'"
-      - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.status()'"
-
-      # Force the initialization of the CUDA runtime as it is lazily loaded by default
-      - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using CUDA; try CUDA.versioninfo(); catch; end'"
-
-      # Download artifacts by running an empty testgroup and thereby executing /test/runtests.jl
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
 
   - label: "🏕️ initialize cpu environment"
     key: "init_cpu"
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "init"
-      CUDA_VISIBLE_DEVICES: "-1"
+      CUDA_VISIBLE_DEVICES: "0"  # NOTE(review): was "-1" (no GPU visible) on this CPU step - confirm exposing GPU 0 is intended
       JULIA_BINDIR: "$TARTARUS_HOME/julia-$JULIA_VERSION/bin"
       TMPDIR: "$TARTARUS_HOME/tmp"
     commands:
       # Download julia binaries
       - "wget -N -P $TARTARUS_HOME https://julialang-s3.julialang.org/bin/linux/x64/$JULIA_MINOR_VERSION/julia-$JULIA_VERSION-linux-x86_64.tar.gz"
       - "tar xf $TARTARUS_HOME/julia-$JULIA_VERSION-linux-x86_64.tar.gz -C $TARTARUS_HOME"
-
-      # Instantiate, precompile, and download artifacts by running an empty testgroup and thereby executing /test/runtests.jl
-      - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.instantiate(; verbose=true)'"
-      - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.precompile()'"
-      - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.status()'"
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
-
-      # Set up the mpiexecjl command
-      - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using MPI; MPI.install_mpiexecjl()'"
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
 
 #####
 ##### Unit tests
@@ -69,6 +60,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🐇 cpu unit tests"
@@ -81,6 +76,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -96,6 +95,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🕊️ cpu poisson solver tests 1"
@@ -108,6 +111,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
   - label: "🦖 gpu poisson solver tests 2"
@@ -119,6 +126,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🦕 cpu poisson solver tests 2"
@@ -131,6 +142,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
   - label: "🌷 gpu matrix poisson solver tests"
@@ -142,6 +157,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🌹 cpu matrix poisson solver tests"
@@ -154,6 +173,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
   - label: "🦤 gpu general solver tests"
@@ -165,6 +188,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🦃 cpu general solver tests"
@@ -177,6 +204,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -192,6 +223,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🦞 cpu time stepping tests 1"
@@ -204,6 +239,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -219,6 +258,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🐬 cpu time stepping tests 2"
@@ -231,6 +274,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -246,6 +293,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🦗 cpu time stepping tests 3"
@@ -258,6 +309,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -273,6 +328,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🎏 cpu turbulence closures"
@@ -285,6 +344,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -300,6 +363,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🦑 cpu hydrostatic free surface model tests"
@@ -312,6 +379,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -327,6 +398,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🦆 cpu shallow water model tests"
@@ -339,6 +414,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -354,6 +433,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🐋 cpu simulation tests"
@@ -366,6 +449,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -381,6 +468,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🍃 cpu lagrangian particles tests"
@@ -393,6 +484,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -408,6 +503,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🤖 cpu abstract operations tests"
@@ -420,6 +519,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -435,6 +538,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🧄 cpu multi region tests"
@@ -447,6 +554,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -462,6 +573,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🐪 cpu nonhydrostatic regression tests"
@@ -474,6 +589,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
   - label: "🙈 gpu hydrostatic regression tests"
@@ -485,6 +604,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🙉 cpu hydrostatic regression tests"
@@ -497,6 +620,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -512,6 +639,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "🦍 cpu scripts"
@@ -524,6 +655,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -539,6 +674,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: GPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_gpu"
 
   - label: "👹 cpu Enzyme extension tests"
@@ -551,6 +690,10 @@ steps:
     agents:
       queue: Oceananigans
       architecture: CPU
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1
     depends_on: "init_cpu"
 
 #####
@@ -560,7 +703,7 @@ steps:
   - label: "🦉 documentation"
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
-      CUDA_VISIBLE_DEVICES: "-1"
+      CUDA_VISIBLE_DEVICES: "0"  # NOTE(review): was "-1" (no GPU visible) - confirm exposing GPU 0 to the docs build is intended
       JULIA_DEBUG: "Documenter"
       TMPDIR: "$TARTARUS_HOME/tmp"
     commands:
diff --git a/.gitignore b/.gitignore
index 5da9079618..8a9d661c88 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 *.jl.mem
 *.mem
 deps/deps.jl
+/Manifest.toml
 
 # Swap files.
 *.swp
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 18b7aa3b7b..34ddf2757c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,7 +20,7 @@ The simplest way to contribute to Oceananigans is to create or comment on issues
 
 The most useful bug reports:
 
-* Provide an explicit code snippet --- not just a link --- that reproduces the bug in the latest tagged version of Oceananigans. This is sometimes called the ["minimal working example"](https://en.wikipedia.org/wiki/Minimal_working_example). Reducing bug-producing code to a minimal example can dramatically decrease the time it takes to resolve an issue.
+* Provide an explicit code snippet -- not just a link -- that reproduces the bug in the latest tagged version of Oceananigans. This is sometimes called the ["minimal working example"](https://en.wikipedia.org/wiki/Minimal_working_example). Reducing bug-producing code to a minimal example can dramatically decrease the time it takes to resolve an issue.
 
 * Paste the _entire_ error received when running the code snippet, even if it's unbelievably long.
 
diff --git a/Dockerfile b/Dockerfile
index 8f11c762ff..3c088bc329 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM julia:1.10.2
+FROM julia:1.10.5
 LABEL maintainer="Ali Ramadhan <alir@mit.edu>"
 
 RUN apt-get update && apt-get install -y hdf5-tools
diff --git a/Manifest.toml b/Manifest.toml
deleted file mode 100644
index 961740127e..0000000000
--- a/Manifest.toml
+++ /dev/null
@@ -1,1112 +0,0 @@
-# This file is machine-generated - editing it directly is not advised
-
-julia_version = "1.10.3"
-manifest_format = "2.0"
-project_hash = "04d395caf937b0921325a77873167e8baa293a99"
-
-[[deps.AbstractFFTs]]
-deps = ["LinearAlgebra"]
-git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
-uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-version = "1.5.0"
-
-    [deps.AbstractFFTs.extensions]
-    AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
-    AbstractFFTsTestExt = "Test"
-
-    [deps.AbstractFFTs.weakdeps]
-    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-    Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[[deps.Adapt]]
-deps = ["LinearAlgebra", "Requires"]
-git-tree-sha1 = "6a55b747d1812e699320963ffde36f1ebdda4099"
-uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "4.0.4"
-weakdeps = ["StaticArrays"]
-
-    [deps.Adapt.extensions]
-    AdaptStaticArraysExt = "StaticArrays"
-
-[[deps.ArgTools]]
-uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
-version = "1.1.1"
-
-[[deps.ArrayInterface]]
-deps = ["Adapt", "LinearAlgebra", "SparseArrays", "SuiteSparse"]
-git-tree-sha1 = "133a240faec6e074e07c31ee75619c90544179cf"
-uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
-version = "7.10.0"
-
-    [deps.ArrayInterface.extensions]
-    ArrayInterfaceBandedMatricesExt = "BandedMatrices"
-    ArrayInterfaceBlockBandedMatricesExt = "BlockBandedMatrices"
-    ArrayInterfaceCUDAExt = "CUDA"
-    ArrayInterfaceCUDSSExt = "CUDSS"
-    ArrayInterfaceChainRulesExt = "ChainRules"
-    ArrayInterfaceGPUArraysCoreExt = "GPUArraysCore"
-    ArrayInterfaceReverseDiffExt = "ReverseDiff"
-    ArrayInterfaceStaticArraysCoreExt = "StaticArraysCore"
-    ArrayInterfaceTrackerExt = "Tracker"
-
-    [deps.ArrayInterface.weakdeps]
-    BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
-    BlockBandedMatrices = "ffab5731-97b5-5995-9138-79e8c1846df0"
-    CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-    CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
-    ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
-    GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
-    ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
-    StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
-    Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
-
-[[deps.Artifacts]]
-uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
-
-[[deps.Atomix]]
-deps = ["UnsafeAtomics"]
-git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
-uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
-version = "0.1.0"
-
-[[deps.BFloat16s]]
-deps = ["LinearAlgebra", "Printf", "Random", "Test"]
-git-tree-sha1 = "2c7cc21e8678eff479978a0a2ef5ce2f51b63dff"
-uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
-version = "0.5.0"
-
-[[deps.Base64]]
-uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
-
-[[deps.Blosc_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Lz4_jll", "Zlib_jll", "Zstd_jll"]
-git-tree-sha1 = "19b98ee7e3db3b4eff74c5c9c72bf32144e24f10"
-uuid = "0b7ba130-8d10-5ba8-a3d6-c5182647fed9"
-version = "1.21.5+0"
-
-[[deps.Bzip2_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "9e2a6b69137e6969bab0152632dcb3bc108c8bdd"
-uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
-version = "1.0.8+1"
-
-[[deps.CEnum]]
-git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc"
-uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
-version = "0.5.0"
-
-[[deps.CFTime]]
-deps = ["Dates", "Printf"]
-git-tree-sha1 = "5afb5c5ba2688ca43a9ad2e5a91cbb93921ccfa1"
-uuid = "179af706-886a-5703-950a-314cd64e0468"
-version = "0.1.3"
-
-[[deps.CUDA]]
-deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "StaticArrays", "Statistics"]
-git-tree-sha1 = "abc3c845165c2d5c03ab61754a90c7f7cff0f6a4"
-uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "5.4.1"
-
-    [deps.CUDA.extensions]
-    ChainRulesCoreExt = "ChainRulesCore"
-    EnzymeCoreExt = "EnzymeCore"
-    SpecialFunctionsExt = "SpecialFunctions"
-
-    [deps.CUDA.weakdeps]
-    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-    EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
-    SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
-
-[[deps.CUDA_Driver_jll]]
-deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
-git-tree-sha1 = "c48f9da18efd43b6b7adb7ee1f93fe5f2926c339"
-uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
-version = "0.9.0+0"
-
-[[deps.CUDA_Runtime_Discovery]]
-deps = ["Libdl"]
-git-tree-sha1 = "5db9da5fdeaa708c22ba86b82c49528f402497f2"
-uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
-version = "0.3.3"
-
-[[deps.CUDA_Runtime_jll]]
-deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
-git-tree-sha1 = "bcba305388e16aa5c879e896726db9e71b4942c6"
-uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
-version = "0.14.0+1"
-
-[[deps.ColorTypes]]
-deps = ["FixedPointNumbers", "Random"]
-git-tree-sha1 = "b10d0b65641d57b8b4d5e234446582de5047050d"
-uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-version = "0.11.5"
-
-[[deps.Colors]]
-deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
-git-tree-sha1 = "362a287c3aa50601b0bc359053d5c2468f0e7ce0"
-uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
-version = "0.12.11"
-
-[[deps.CommonDataModel]]
-deps = ["CFTime", "DataStructures", "Dates", "Preferences", "Printf", "Statistics"]
-git-tree-sha1 = "d6fb5bf939a2753c74984b11434ea25d6c397a58"
-uuid = "1fbeeb36-5f17-413c-809b-666fb144f157"
-version = "0.3.6"
-
-[[deps.Compat]]
-deps = ["TOML", "UUIDs"]
-git-tree-sha1 = "b1c55339b7c6c350ee89f2c1604299660525b248"
-uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "4.15.0"
-weakdeps = ["Dates", "LinearAlgebra"]
-
-    [deps.Compat.extensions]
-    CompatLinearAlgebraExt = "LinearAlgebra"
-
-[[deps.CompilerSupportLibraries_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.1.1+0"
-
-[[deps.ConstructionBase]]
-deps = ["LinearAlgebra"]
-git-tree-sha1 = "260fd2400ed2dab602a7c15cf10c1933c59930a2"
-uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
-version = "1.5.5"
-
-    [deps.ConstructionBase.extensions]
-    ConstructionBaseIntervalSetsExt = "IntervalSets"
-    ConstructionBaseStaticArraysExt = "StaticArrays"
-
-    [deps.ConstructionBase.weakdeps]
-    IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
-    StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
-
-[[deps.Crayons]]
-git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
-uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
-version = "4.1.1"
-
-[[deps.CubedSphere]]
-deps = ["Elliptic", "FFTW", "Printf", "ProgressBars", "SpecialFunctions", "TaylorSeries", "Test"]
-git-tree-sha1 = "10134667d7d3569b191a65801514271b8a93b292"
-uuid = "7445602f-e544-4518-8976-18f8e8ae6cdb"
-version = "0.2.5"
-
-[[deps.DataAPI]]
-git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe"
-uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
-version = "1.16.0"
-
-[[deps.DataFrames]]
-deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
-git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
-uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-version = "1.6.1"
-
-[[deps.DataStructures]]
-deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "1d0a14036acb104d9e89698bd408f63ab58cdc82"
-uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.18.20"
-
-[[deps.DataValueInterfaces]]
-git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
-uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
-version = "1.0.0"
-
-[[deps.Dates]]
-deps = ["Printf"]
-uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
-
-[[deps.DiskArrays]]
-deps = ["LRUCache", "OffsetArrays"]
-git-tree-sha1 = "ef25c513cad08d7ebbed158c91768ae32f308336"
-uuid = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3"
-version = "0.3.23"
-
-[[deps.Distances]]
-deps = ["LinearAlgebra", "Statistics", "StatsAPI"]
-git-tree-sha1 = "66c4c81f259586e8f002eacebc177e1fb06363b0"
-uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
-version = "0.10.11"
-
-    [deps.Distances.extensions]
-    DistancesChainRulesCoreExt = "ChainRulesCore"
-    DistancesSparseArraysExt = "SparseArrays"
-
-    [deps.Distances.weakdeps]
-    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-    SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
-
-[[deps.Distributed]]
-deps = ["Random", "Serialization", "Sockets"]
-uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
-
-[[deps.DocStringExtensions]]
-deps = ["LibGit2"]
-git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
-uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
-version = "0.9.3"
-
-[[deps.Downloads]]
-deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
-uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
-version = "1.6.0"
-
-[[deps.Elliptic]]
-git-tree-sha1 = "71c79e77221ab3a29918aaf6db4f217b89138608"
-uuid = "b305315f-e792-5b7a-8f41-49f472929428"
-version = "1.0.1"
-
-[[deps.ExprTools]]
-git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
-uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
-version = "0.1.10"
-
-[[deps.FFTW]]
-deps = ["AbstractFFTs", "FFTW_jll", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"]
-git-tree-sha1 = "4820348781ae578893311153d69049a93d05f39d"
-uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-version = "1.8.0"
-
-[[deps.FFTW_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "c6033cc3892d0ef5bb9cd29b7f2f0331ea5184ea"
-uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
-version = "3.3.10+0"
-
-[[deps.FileIO]]
-deps = ["Pkg", "Requires", "UUIDs"]
-git-tree-sha1 = "82d8afa92ecf4b52d78d869f038ebfb881267322"
-uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
-version = "1.16.3"
-
-[[deps.FileWatching]]
-uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
-
-[[deps.FixedPointNumbers]]
-deps = ["Statistics"]
-git-tree-sha1 = "05882d6995ae5c12bb5f36dd2ed3f61c98cbb172"
-uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
-version = "0.8.5"
-
-[[deps.Future]]
-deps = ["Random"]
-uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
-
-[[deps.GMP_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "781609d7-10c4-51f6-84f2-b8444358ff6d"
-version = "6.2.1+6"
-
-[[deps.GPUArrays]]
-deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
-git-tree-sha1 = "38cb19b8a3e600e509dc36a6396ac74266d108c1"
-uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "10.1.1"
-
-[[deps.GPUArraysCore]]
-deps = ["Adapt"]
-git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950"
-uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
-version = "0.1.6"
-
-[[deps.GPUCompiler]]
-deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "518ebd058c9895de468a8c255797b0c53fdb44dd"
-uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.26.5"
-
-[[deps.Glob]]
-git-tree-sha1 = "97285bbd5230dd766e9ef6749b80fc617126d496"
-uuid = "c27321d9-0574-5035-807b-f59d2c89b15c"
-version = "1.3.1"
-
-[[deps.GnuTLS_jll]]
-deps = ["Artifacts", "GMP_jll", "JLLWrappers", "Libdl", "Nettle_jll", "P11Kit_jll", "Zlib_jll"]
-git-tree-sha1 = "383db7d3f900f4c1f47a8a04115b053c095e48d3"
-uuid = "0951126a-58fd-58f1-b5b3-b08c7c4a876d"
-version = "3.8.4+0"
-
-[[deps.HDF5_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "LibCURL_jll", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "OpenSSL_jll", "TOML", "Zlib_jll", "libaec_jll"]
-git-tree-sha1 = "82a471768b513dc39e471540fdadc84ff80ff997"
-uuid = "0234f1f7-429e-5d53-9886-15a909be8d59"
-version = "1.14.3+3"
-
-[[deps.Hwloc_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "ca0f6bf568b4bfc807e7537f081c81e35ceca114"
-uuid = "e33a78d0-f292-5ffc-b300-72abe9b543c8"
-version = "2.10.0+0"
-
-[[deps.IfElse]]
-git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1"
-uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
-version = "0.1.1"
-
-[[deps.IncompleteLU]]
-deps = ["LinearAlgebra", "SparseArrays"]
-git-tree-sha1 = "6c676e79f98abb6d33fa28122cad099f1e464afe"
-uuid = "40713840-3770-5561-ab4c-a76e7d0d7895"
-version = "0.2.1"
-
-[[deps.InlineStrings]]
-deps = ["Parsers"]
-git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461"
-uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
-version = "1.4.0"
-
-[[deps.IntelOpenMP_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "be50fe8df3acbffa0274a744f1a99d29c45a57f4"
-uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
-version = "2024.1.0+0"
-
-[[deps.InteractiveUtils]]
-deps = ["Markdown"]
-uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-
-[[deps.InvertedIndices]]
-git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
-uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
-version = "1.3.0"
-
-[[deps.IrrationalConstants]]
-git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
-uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
-version = "0.2.2"
-
-[[deps.IterativeSolvers]]
-deps = ["LinearAlgebra", "Printf", "Random", "RecipesBase", "SparseArrays"]
-git-tree-sha1 = "59545b0a2b27208b0650df0a46b8e3019f85055b"
-uuid = "42fd0dbc-a981-5370-80f2-aaf504508153"
-version = "0.9.4"
-
-[[deps.IteratorInterfaceExtensions]]
-git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
-uuid = "82899510-4779-5014-852e-03e436cf321d"
-version = "1.0.0"
-
-[[deps.JLD2]]
-deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "Pkg", "PrecompileTools", "Reexport", "Requires", "TranscodingStreams", "UUIDs", "Unicode"]
-git-tree-sha1 = "bdbe8222d2f5703ad6a7019277d149ec6d78c301"
-uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-version = "0.4.48"
-
-[[deps.JLLWrappers]]
-deps = ["Artifacts", "Preferences"]
-git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
-uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
-version = "1.5.0"
-
-[[deps.JSON3]]
-deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"]
-git-tree-sha1 = "eb3edce0ed4fa32f75a0a11217433c31d56bd48b"
-uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
-version = "1.14.0"
-
-    [deps.JSON3.extensions]
-    JSON3ArrowExt = ["ArrowTypes"]
-
-    [deps.JSON3.weakdeps]
-    ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
-
-[[deps.JuliaNVTXCallbacks_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
-uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
-version = "0.2.1+0"
-
-[[deps.KernelAbstractions]]
-deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "db02395e4c374030c53dc28f3c1d33dec35f7272"
-uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-version = "0.9.19"
-
-    [deps.KernelAbstractions.extensions]
-    EnzymeExt = "EnzymeCore"
-
-    [deps.KernelAbstractions.weakdeps]
-    EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
-
-[[deps.LLVM]]
-deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"]
-git-tree-sha1 = "065c36f95709dd4a676dc6839a35d6fa6f192f24"
-uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "7.1.0"
-weakdeps = ["BFloat16s"]
-
-    [deps.LLVM.extensions]
-    BFloat16sExt = "BFloat16s"
-
-[[deps.LLVMExtra_jll]]
-deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
-git-tree-sha1 = "88b916503aac4fb7f701bb625cd84ca5dd1677bc"
-uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.29+0"
-
-[[deps.LLVMLoopInfo]]
-git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea"
-uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586"
-version = "1.0.0"
-
-[[deps.LRUCache]]
-git-tree-sha1 = "b3cc6698599b10e652832c2f23db3cab99d51b59"
-uuid = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
-version = "1.6.1"
-weakdeps = ["Serialization"]
-
-    [deps.LRUCache.extensions]
-    SerializationExt = ["Serialization"]
-
-[[deps.LaTeXStrings]]
-git-tree-sha1 = "50901ebc375ed41dbf8058da26f9de442febbbec"
-uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
-version = "1.3.1"
-
-[[deps.LazyArtifacts]]
-deps = ["Artifacts", "Pkg"]
-uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
-
-[[deps.LibCURL]]
-deps = ["LibCURL_jll", "MozillaCACerts_jll"]
-uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
-version = "0.6.4"
-
-[[deps.LibCURL_jll]]
-deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
-uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
-version = "8.4.0+0"
-
-[[deps.LibGit2]]
-deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"]
-uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
-
-[[deps.LibGit2_jll]]
-deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"]
-uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5"
-version = "1.6.4+0"
-
-[[deps.LibSSH2_jll]]
-deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
-uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
-version = "1.11.0+1"
-
-[[deps.Libdl]]
-uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
-
-[[deps.Libiconv_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175"
-uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
-version = "1.17.0+0"
-
-[[deps.LinearAlgebra]]
-deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
-uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-
-[[deps.LogExpFunctions]]
-deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
-git-tree-sha1 = "18144f3e9cbe9b15b070288eef858f71b291ce37"
-uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
-version = "0.3.27"
-
-    [deps.LogExpFunctions.extensions]
-    LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
-    LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
-    LogExpFunctionsInverseFunctionsExt = "InverseFunctions"
-
-    [deps.LogExpFunctions.weakdeps]
-    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-    ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
-    InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
-
-[[deps.Logging]]
-uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
-
-[[deps.Lz4_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "6c26c5e8a4203d43b5497be3ec5d4e0c3cde240a"
-uuid = "5ced341a-0733-55b8-9ab6-a4889d929147"
-version = "1.9.4+0"
-
-[[deps.MKL_jll]]
-deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "oneTBB_jll"]
-git-tree-sha1 = "80b2833b56d466b3858d565adcd16a4a05f2089b"
-uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
-version = "2024.1.0+0"
-
-[[deps.MPI]]
-deps = ["Distributed", "DocStringExtensions", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "PkgVersion", "PrecompileTools", "Requires", "Serialization", "Sockets"]
-git-tree-sha1 = "4e3136db3735924f96632a5b40a5979f1f53fa07"
-uuid = "da04e1cc-30fd-572f-bb4f-1f8673147195"
-version = "0.20.19"
-
-    [deps.MPI.extensions]
-    AMDGPUExt = "AMDGPU"
-    CUDAExt = "CUDA"
-
-    [deps.MPI.weakdeps]
-    AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
-    CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-
-[[deps.MPICH_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "Hwloc_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
-git-tree-sha1 = "4099bb6809ac109bfc17d521dad33763bcf026b7"
-uuid = "7cb0a576-ebde-5e09-9194-50597f1243b4"
-version = "4.2.1+1"
-
-[[deps.MPIPreferences]]
-deps = ["Libdl", "Preferences"]
-git-tree-sha1 = "c105fe467859e7f6e9a852cb15cb4301126fac07"
-uuid = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
-version = "0.1.11"
-
-[[deps.MPItrampoline_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
-git-tree-sha1 = "8c35d5420193841b2f367e658540e8d9e0601ed0"
-uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748"
-version = "5.4.0+0"
-
-[[deps.MacroTools]]
-deps = ["Markdown", "Random"]
-git-tree-sha1 = "2fa9ee3e63fd3a4f7a9a4f4744a52f4856de82df"
-uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-version = "0.5.13"
-
-[[deps.Markdown]]
-deps = ["Base64"]
-uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
-
-[[deps.MbedTLS_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
-version = "2.28.2+1"
-
-[[deps.MicrosoftMPI_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "f12a29c4400ba812841c6ace3f4efbb6dbb3ba01"
-uuid = "9237b28f-5490-5468-be7b-bb81f5f5e6cf"
-version = "10.1.4+2"
-
-[[deps.Missings]]
-deps = ["DataAPI"]
-git-tree-sha1 = "ec4f7fbeab05d7747bdf98eb74d130a2a2ed298d"
-uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
-version = "1.2.0"
-
-[[deps.Mmap]]
-uuid = "a63ad114-7e13-5084-954f-fe012c677804"
-
-[[deps.MozillaCACerts_jll]]
-uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
-version = "2023.1.10"
-
-[[deps.NCDatasets]]
-deps = ["CFTime", "CommonDataModel", "DataStructures", "Dates", "DiskArrays", "NetCDF_jll", "NetworkOptions", "Printf"]
-git-tree-sha1 = "a640912695952b074672edb5f9aaee2f7f9fd59a"
-uuid = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
-version = "0.14.4"
-
-[[deps.NVTX]]
-deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
-git-tree-sha1 = "53046f0483375e3ed78e49190f1154fa0a4083a1"
-uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
-version = "0.3.4"
-
-[[deps.NVTX_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
-uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
-version = "3.1.0+2"
-
-[[deps.NetCDF_jll]]
-deps = ["Artifacts", "Blosc_jll", "Bzip2_jll", "HDF5_jll", "JLLWrappers", "LazyArtifacts", "LibCURL_jll", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "TOML", "XML2_jll", "Zlib_jll", "Zstd_jll", "libzip_jll"]
-git-tree-sha1 = "4686378c4ae1d1948cfbe46c002a11a4265dcb07"
-uuid = "7243133f-43d8-5620-bbf4-c2c921802cf3"
-version = "400.902.211+1"
-
-[[deps.Nettle_jll]]
-deps = ["Artifacts", "GMP_jll", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "eca63e3847dad608cfa6a3329b95ef674c7160b4"
-uuid = "4c82536e-c426-54e4-b420-14f461c4ed8b"
-version = "3.7.2+0"
-
-[[deps.NetworkOptions]]
-uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
-version = "1.2.0"
-
-[[deps.OffsetArrays]]
-git-tree-sha1 = "e64b4f5ea6b7389f6f046d13d4896a8f9c1ba71e"
-uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
-version = "1.14.0"
-weakdeps = ["Adapt"]
-
-    [deps.OffsetArrays.extensions]
-    OffsetArraysAdaptExt = "Adapt"
-
-[[deps.OpenBLAS_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
-uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
-version = "0.3.23+4"
-
-[[deps.OpenLibm_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
-version = "0.8.1+2"
-
-[[deps.OpenMPI_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
-git-tree-sha1 = "e25c1778a98e34219a00455d6e4384e017ea9762"
-uuid = "fe0851c0-eecd-5654-98d4-656369965a5c"
-version = "4.1.6+0"
-
-[[deps.OpenSSL_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "3da7367955dcc5c54c1ba4d402ccdc09a1a3e046"
-uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95"
-version = "3.0.13+1"
-
-[[deps.OpenSpecFun_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
-uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
-version = "0.5.5+0"
-
-[[deps.OrderedCollections]]
-git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5"
-uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.6.3"
-
-[[deps.P11Kit_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "2cd396108e178f3ae8dedbd8e938a18726ab2fbf"
-uuid = "c2071276-7c44-58a7-b746-946036e04d0a"
-version = "0.24.1+0"
-
-[[deps.PackageExtensionCompat]]
-git-tree-sha1 = "fb28e33b8a95c4cee25ce296c817d89cc2e53518"
-uuid = "65ce6f38-6b18-4e1d-a461-8949797d7930"
-version = "1.0.2"
-weakdeps = ["Requires", "TOML"]
-
-[[deps.Parsers]]
-deps = ["Dates", "PrecompileTools", "UUIDs"]
-git-tree-sha1 = "8489905bcdbcfac64d1daa51ca07c0d8f0283821"
-uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "2.8.1"
-
-[[deps.PencilArrays]]
-deps = ["Adapt", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "StaticArrayInterface", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
-git-tree-sha1 = "6510e851700a851944f7ffa5cd990cced4802ad2"
-uuid = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e"
-version = "0.19.3"
-
-    [deps.PencilArrays.extensions]
-    PencilArraysDiffEqExt = ["DiffEqBase"]
-    PencilArraysHDF5Ext = ["HDF5"]
-
-    [deps.PencilArrays.weakdeps]
-    DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
-    HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
-
-[[deps.PencilFFTs]]
-deps = ["AbstractFFTs", "FFTW", "LinearAlgebra", "MPI", "PencilArrays", "Reexport", "TimerOutputs"]
-git-tree-sha1 = "bd69f3f0ee248cfb4241800aefb705b5ded592ff"
-uuid = "4a48f351-57a6-4416-9ec4-c37015456aae"
-version = "0.15.1"
-
-[[deps.Pkg]]
-deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
-uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.10.0"
-
-[[deps.PkgVersion]]
-deps = ["Pkg"]
-git-tree-sha1 = "f9501cc0430a26bc3d156ae1b5b0c1b47af4d6da"
-uuid = "eebad327-c553-4316-9ea0-9fa01ccd7688"
-version = "0.3.3"
-
-[[deps.PooledArrays]]
-deps = ["DataAPI", "Future"]
-git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3"
-uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
-version = "1.4.3"
-
-[[deps.PrecompileTools]]
-deps = ["Preferences"]
-git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f"
-uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
-version = "1.2.1"
-
-[[deps.Preferences]]
-deps = ["TOML"]
-git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6"
-uuid = "21216c6a-2e73-6563-6e65-726566657250"
-version = "1.4.3"
-
-[[deps.PrettyTables]]
-deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"]
-git-tree-sha1 = "66b20dd35966a748321d3b2537c4584cf40387c7"
-uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
-version = "2.3.2"
-
-[[deps.Printf]]
-deps = ["Unicode"]
-uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-
-[[deps.ProgressBars]]
-deps = ["Printf"]
-git-tree-sha1 = "b437cdb0385ed38312d91d9c00c20f3798b30256"
-uuid = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
-version = "1.5.1"
-
-[[deps.Quaternions]]
-deps = ["LinearAlgebra", "Random", "RealDot"]
-git-tree-sha1 = "994cc27cdacca10e68feb291673ec3a76aa2fae9"
-uuid = "94ee1d12-ae83-5a48-8b1c-48b8ff168ae0"
-version = "0.7.6"
-
-[[deps.REPL]]
-deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
-uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
-
-[[deps.Random]]
-deps = ["SHA"]
-uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-
-[[deps.Random123]]
-deps = ["Random", "RandomNumbers"]
-git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7"
-uuid = "74087812-796a-5b5d-8853-05524746bad3"
-version = "1.7.0"
-
-[[deps.RandomNumbers]]
-deps = ["Random", "Requires"]
-git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
-uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
-version = "1.5.3"
-
-[[deps.RealDot]]
-deps = ["LinearAlgebra"]
-git-tree-sha1 = "9f0a1b71baaf7650f4fa8a1d168c7fb6ee41f0c9"
-uuid = "c1ae055f-0cd5-4b69-90a6-9a35b1a98df9"
-version = "0.1.0"
-
-[[deps.RecipesBase]]
-deps = ["PrecompileTools"]
-git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff"
-uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
-version = "1.3.4"
-
-[[deps.Reexport]]
-git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
-uuid = "189a3867-3050-52da-a836-e630ba90ab69"
-version = "1.2.2"
-
-[[deps.Requires]]
-deps = ["UUIDs"]
-git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
-uuid = "ae029012-a4dd-5104-9daa-d747884805df"
-version = "1.3.0"
-
-[[deps.Rotations]]
-deps = ["LinearAlgebra", "Quaternions", "Random", "StaticArrays"]
-git-tree-sha1 = "5680a9276685d392c87407df00d57c9924d9f11e"
-uuid = "6038ab10-8711-5258-84ad-4b1120ba62dc"
-version = "1.7.1"
-weakdeps = ["RecipesBase"]
-
-    [deps.Rotations.extensions]
-    RotationsRecipesBaseExt = "RecipesBase"
-
-[[deps.SHA]]
-uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
-version = "0.7.0"
-
-[[deps.Scratch]]
-deps = ["Dates"]
-git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386"
-uuid = "6c6a2e73-6563-6170-7368-637461726353"
-version = "1.2.1"
-
-[[deps.SeawaterPolynomials]]
-git-tree-sha1 = "6d85acd6de472f8e6da81c61c7c5b6280a55e0bc"
-uuid = "d496a93d-167e-4197-9f49-d3af4ff8fe40"
-version = "0.3.4"
-
-[[deps.SentinelArrays]]
-deps = ["Dates", "Random"]
-git-tree-sha1 = "90b4f68892337554d31cdcdbe19e48989f26c7e6"
-uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
-version = "1.4.3"
-
-[[deps.Serialization]]
-uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
-
-[[deps.Sockets]]
-uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
-
-[[deps.SortingAlgorithms]]
-deps = ["DataStructures"]
-git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085"
-uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
-version = "1.2.1"
-
-[[deps.SparseArrays]]
-deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
-uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
-version = "1.10.0"
-
-[[deps.SpecialFunctions]]
-deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
-git-tree-sha1 = "2f5d4697f21388cbe1ff299430dd169ef97d7e14"
-uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "2.4.0"
-
-    [deps.SpecialFunctions.extensions]
-    SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
-
-    [deps.SpecialFunctions.weakdeps]
-    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-
-[[deps.Static]]
-deps = ["IfElse"]
-git-tree-sha1 = "d2fdac9ff3906e27f7a618d47b676941baa6c80c"
-uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
-version = "0.8.10"
-
-[[deps.StaticArrayInterface]]
-deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "PrecompileTools", "Requires", "SparseArrays", "Static", "SuiteSparse"]
-git-tree-sha1 = "5d66818a39bb04bf328e92bc933ec5b4ee88e436"
-uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718"
-version = "1.5.0"
-weakdeps = ["OffsetArrays", "StaticArrays"]
-
-    [deps.StaticArrayInterface.extensions]
-    StaticArrayInterfaceOffsetArraysExt = "OffsetArrays"
-    StaticArrayInterfaceStaticArraysExt = "StaticArrays"
-
-[[deps.StaticArrays]]
-deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"]
-git-tree-sha1 = "9ae599cd7529cfce7fea36cf00a62cfc56f0f37c"
-uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.9.4"
-
-    [deps.StaticArrays.extensions]
-    StaticArraysChainRulesCoreExt = "ChainRulesCore"
-    StaticArraysStatisticsExt = "Statistics"
-
-    [deps.StaticArrays.weakdeps]
-    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-    Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-
-[[deps.StaticArraysCore]]
-git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
-uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
-version = "1.4.2"
-
-[[deps.StaticPermutations]]
-git-tree-sha1 = "193c3daa18ff3e55c1dae66acb6a762c4a3bdb0b"
-uuid = "15972242-4b8f-49a0-b8a1-9ac0e7a1a45d"
-version = "0.3.0"
-
-[[deps.Statistics]]
-deps = ["LinearAlgebra", "SparseArrays"]
-uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-version = "1.10.0"
-
-[[deps.StatsAPI]]
-deps = ["LinearAlgebra"]
-git-tree-sha1 = "1ff449ad350c9c4cbc756624d6f8a8c3ef56d3ed"
-uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
-version = "1.7.0"
-
-[[deps.Strided]]
-deps = ["LinearAlgebra", "StridedViews", "TupleTools"]
-git-tree-sha1 = "40c69be0e1b72ee2f42923b7d1ff13e0b04e675c"
-uuid = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
-version = "2.0.4"
-
-[[deps.StridedViews]]
-deps = ["LinearAlgebra", "PackageExtensionCompat"]
-git-tree-sha1 = "5b765c4e401693ab08981989f74a36a010aa1d8e"
-uuid = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
-version = "0.2.2"
-weakdeps = ["CUDA"]
-
-    [deps.StridedViews.extensions]
-    StridedViewsCUDAExt = "CUDA"
-
-[[deps.StringManipulation]]
-deps = ["PrecompileTools"]
-git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5"
-uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e"
-version = "0.3.4"
-
-[[deps.StructArrays]]
-deps = ["ConstructionBase", "DataAPI", "Tables"]
-git-tree-sha1 = "f4dc295e983502292c4c3f951dbb4e985e35b3be"
-uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
-version = "0.6.18"
-weakdeps = ["Adapt", "GPUArraysCore", "SparseArrays", "StaticArrays"]
-
-    [deps.StructArrays.extensions]
-    StructArraysAdaptExt = "Adapt"
-    StructArraysGPUArraysCoreExt = "GPUArraysCore"
-    StructArraysSparseArraysExt = "SparseArrays"
-    StructArraysStaticArraysExt = "StaticArrays"
-
-[[deps.StructTypes]]
-deps = ["Dates", "UUIDs"]
-git-tree-sha1 = "ca4bccb03acf9faaf4137a9abc1881ed1841aa70"
-uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
-version = "1.10.0"
-
-[[deps.SuiteSparse]]
-deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"]
-uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"
-
-[[deps.SuiteSparse_jll]]
-deps = ["Artifacts", "Libdl", "libblastrampoline_jll"]
-uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
-version = "7.2.1+1"
-
-[[deps.TOML]]
-deps = ["Dates"]
-uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
-version = "1.0.3"
-
-[[deps.TableTraits]]
-deps = ["IteratorInterfaceExtensions"]
-git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
-uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
-version = "1.0.1"
-
-[[deps.Tables]]
-deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"]
-git-tree-sha1 = "cb76cf677714c095e535e3501ac7954732aeea2d"
-uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
-version = "1.11.1"
-
-[[deps.Tar]]
-deps = ["ArgTools", "SHA"]
-uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-version = "1.10.0"
-
-[[deps.TaylorSeries]]
-deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"]
-git-tree-sha1 = "1c7170668366821b0c4c4fe03ee78f8d6cf36e2c"
-uuid = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea"
-version = "0.16.0"
-
-    [deps.TaylorSeries.extensions]
-    TaylorSeriesIAExt = "IntervalArithmetic"
-
-    [deps.TaylorSeries.weakdeps]
-    IntervalArithmetic = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253"
-
-[[deps.Test]]
-deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
-uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[[deps.TimerOutputs]]
-deps = ["ExprTools", "Printf"]
-git-tree-sha1 = "5a13ae8a41237cff5ecf34f73eb1b8f42fff6531"
-uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
-version = "0.5.24"
-
-[[deps.TranscodingStreams]]
-git-tree-sha1 = "5d54d076465da49d6746c647022f3b3674e64156"
-uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
-version = "0.10.8"
-weakdeps = ["Random", "Test"]
-
-    [deps.TranscodingStreams.extensions]
-    TestExt = ["Test", "Random"]
-
-[[deps.TupleTools]]
-git-tree-sha1 = "41d61b1c545b06279871ef1a4b5fcb2cac2191cd"
-uuid = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
-version = "1.5.0"
-
-[[deps.UUIDs]]
-deps = ["Random", "SHA"]
-uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
-
-[[deps.Unicode]]
-uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
-
-[[deps.UnsafeAtomics]]
-git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
-uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
-version = "0.2.1"
-
-[[deps.UnsafeAtomicsLLVM]]
-deps = ["LLVM", "UnsafeAtomics"]
-git-tree-sha1 = "d9f5962fecd5ccece07db1ff006fb0b5271bdfdd"
-uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
-version = "0.1.4"
-
-[[deps.VersionParsing]]
-git-tree-sha1 = "58d6e80b4ee071f5efd07fda82cb9fbe17200868"
-uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
-version = "1.3.0"
-
-[[deps.XML2_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"]
-git-tree-sha1 = "52ff2af32e591541550bd753c0da8b9bc92bb9d9"
-uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
-version = "2.12.7+0"
-
-[[deps.XZ_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "ac88fb95ae6447c8dda6a5503f3bafd496ae8632"
-uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
-version = "5.4.6+0"
-
-[[deps.Zlib_jll]]
-deps = ["Libdl"]
-uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.13+1"
-
-[[deps.Zstd_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "e678132f07ddb5bfa46857f0d7620fb9be675d3b"
-uuid = "3161d3a3-bdf6-5164-811a-617609db77b4"
-version = "1.5.6+0"
-
-[[deps.libaec_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "46bf7be2917b59b761247be3f317ddf75e50e997"
-uuid = "477f73a3-ac25-53e9-8cc3-50b2fa2566f0"
-version = "1.1.2+0"
-
-[[deps.libblastrampoline_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
-version = "5.8.0+1"
-
-[[deps.libzip_jll]]
-deps = ["Artifacts", "Bzip2_jll", "GnuTLS_jll", "JLLWrappers", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"]
-git-tree-sha1 = "3282b7d16ae7ac3e57ec2f3fa8fafb564d8f9f7f"
-uuid = "337d8026-41b4-5cde-a456-74a10e5b31d1"
-version = "1.10.1+0"
-
-[[deps.nghttp2_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
-version = "1.52.0+1"
-
-[[deps.oneTBB_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "7d0ea0f4895ef2f5cb83645fa689e52cb55cf493"
-uuid = "1317d2d5-d96f-522e-a858-c73665f53c3e"
-version = "2021.12.0+0"
-
-[[deps.p7zip_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
-version = "17.4.0+2"
diff --git a/Project.toml b/Project.toml
index c56b10d0c9..dcbc5183e6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -17,6 +17,7 @@ IncompleteLU = "40713840-3770-5561-ab4c-a76e7d0d7895"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
+JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
@@ -45,7 +46,6 @@ OceananigansEnzymeExt = "Enzyme"
 Adapt = "3, 4"
 CUDA = "4.1.1, 5"
 Crayons = "4"
-CubedSphere = "0.1, 0.2"
 Dates = "1.9"
 Distances = "0.10"
 DocStringExtensions = "0.8, 0.9"
@@ -90,6 +90,4 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 TimesDates = "bdfc003b-8df8-5c39-adcd-3a9087f5df4a"
 
 [targets]
-test = ["BenchmarkTools", "Coverage", "CUDA_Runtime_jll", "DataDeps", "Enzyme",
-        "InteractiveUtils", "MPIPreferences", "OpenMPI_jll", "Test", "TimerOutputs",
-        "TimesDates", "SafeTestsets"]
+test = ["BenchmarkTools", "Coverage", "CUDA_Runtime_jll", "DataDeps", "Enzyme", "InteractiveUtils", "MPIPreferences", "OpenMPI_jll", "Test", "TimerOutputs", "TimesDates", "SafeTestsets"]
diff --git a/README.md b/README.md
index 6abd721396..699575ee0a 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,19 @@
   </a>
 </p>
 
+<!-- Downloads -->
+<!-- counts downloads from individual IPs excluding bots (eg, CI) -->
+<!-- see https://discourse.julialang.org/t/announcing-package-download-stats/69073 -->
+
+<p align="center">
+  <a href="https://juliapkgstats.com/pkg/Oceananigans">
+    <img alt="Downloads per month" src="https://img.shields.io/badge/dynamic/json?url=http%3A%2F%2Fjuliapkgstats.com%2Fapi%2Fv1%2Fmonthly_downloads%2FOceananigans&query=total_requests&suffix=%2Fmonth&label=Downloads&style=flat-square">
+  </a>
+  <a href="https://juliapkgstats.com/pkg/Oceananigans">
+    <img alt="Total downloads" src="https://img.shields.io/badge/dynamic/json?url=http%3A%2F%2Fjuliapkgstats.com%2Fapi%2Fv1%2Ftotal_downloads%2FOceananigans&query=total_requests&label=Total%20Downloads&style=flat-square">
+  </a>
+</p>
+
 <!-- CI/CD badges -->
 <p align="center">
   <a href="https://buildkite.com/clima/oceananigans">
@@ -59,13 +72,18 @@ Oceananigans.jl is developed by the [Climate Modeling Alliance](https://clima.ca
 
 ## Contents
 
-* [Installation instructions](#installation-instructions)
-* [Running your first model](#running-your-first-model)
-* [The Oceananigans knowledge base](#the-oceananigans-knowledge-base)
-* [Citing](#citing)
-* [Contributing](#contributing)
-* [Movies](#movies)
-* [Performance benchmarks](#performance-benchmarks)
+- [Contents](#contents)
+- [Installation instructions](#installation-instructions)
+- [Running your first model](#running-your-first-model)
+- [The Oceananigans knowledge base](#the-oceananigans-knowledge-base)
+- [Citing](#citing)
+- [Contributing](#contributing)
+- [Movies](#movies)
+  - [Deep convection](#deep-convection)
+  - [Free convection](#free-convection)
+  - [Winds blowing over the ocean](#winds-blowing-over-the-ocean)
+  - [Free convection with wind stress](#free-convection-with-wind-stress)
+- [Performance benchmarks](#performance-benchmarks)
 
 ## Installation instructions
 
@@ -102,7 +120,7 @@ simulation = Simulation(model; Δt=0.01, stop_time=4)
 run!(simulation)
 ```
 
-But there's more: changing `CPU()` to `GPU()` makes this code on a CUDA-enabled Nvidia GPU.
+But there's more: changing `CPU()` to `GPU()` makes this code run on a CUDA-enabled Nvidia GPU.
 
 Dive into [the documentation](https://clima.github.io/OceananigansDocumentation/stable/) for more code examples and tutorials.
 Below, you'll find movies from GPU simulations along with CPU and GPU [performance benchmarks](https://github.com/clima/Oceananigans.jl#performance-benchmarks).
diff --git a/benchmark/Manifest.toml b/benchmark/Manifest.toml
index 144f0dcf75..4b54e33a93 100644
--- a/benchmark/Manifest.toml
+++ b/benchmark/Manifest.toml
@@ -1,1311 +1,1886 @@
 # This file is machine-generated - editing it directly is not advised
 
-[[AbstractFFTs]]
-deps = ["ChainRulesCore", "LinearAlgebra"]
-git-tree-sha1 = "6f1d9bc1c08f9f4a8fa92e3ea3cb50153a1b40d4"
+julia_version = "1.10.4"
+manifest_format = "2.0"
+project_hash = "04609fb9ae45a4a279fa18468ec20a5fba4d07b8"
+
+[[deps.AbstractFFTs]]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef"
 uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-version = "1.1.0"
+version = "1.5.0"
 
-[[AbstractTrees]]
-git-tree-sha1 = "03e0550477d86222521d254b741d470ba17ea0b5"
+    [deps.AbstractFFTs.extensions]
+    AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
+    AbstractFFTsTestExt = "Test"
+
+    [deps.AbstractFFTs.weakdeps]
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+    Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[[deps.AbstractTrees]]
+git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
 uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-version = "0.3.4"
+version = "0.4.5"
 
-[[Adapt]]
-deps = ["LinearAlgebra"]
-git-tree-sha1 = "af92965fb30777147966f58acb05da51c5616b5f"
+[[deps.Adapt]]
+deps = ["LinearAlgebra", "Requires"]
+git-tree-sha1 = "6a55b747d1812e699320963ffde36f1ebdda4099"
 uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "3.3.3"
+version = "4.0.4"
+weakdeps = ["StaticArrays"]
 
-[[ArgTools]]
+    [deps.Adapt.extensions]
+    AdaptStaticArraysExt = "StaticArrays"
+
+[[deps.ArgTools]]
 uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
+version = "1.1.1"
 
-[[ArrayInterface]]
-deps = ["Compat", "IfElse", "LinearAlgebra", "Requires", "SparseArrays", "Static"]
-git-tree-sha1 = "c933ce606f6535a7c7b98e1d86d5d1014f730596"
+[[deps.ArrayInterface]]
+deps = ["Adapt", "LinearAlgebra", "SparseArrays", "SuiteSparse"]
+git-tree-sha1 = "5c9b74c973181571deb6442d41e5c902e6b9f38e"
 uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
-version = "5.0.7"
-
-[[Artifacts]]
+version = "7.12.0"
+
+    [deps.ArrayInterface.extensions]
+    ArrayInterfaceBandedMatricesExt = "BandedMatrices"
+    ArrayInterfaceBlockBandedMatricesExt = "BlockBandedMatrices"
+    ArrayInterfaceCUDAExt = "CUDA"
+    ArrayInterfaceCUDSSExt = "CUDSS"
+    ArrayInterfaceChainRulesExt = "ChainRules"
+    ArrayInterfaceGPUArraysCoreExt = "GPUArraysCore"
+    ArrayInterfaceReverseDiffExt = "ReverseDiff"
+    ArrayInterfaceStaticArraysCoreExt = "StaticArraysCore"
+    ArrayInterfaceTrackerExt = "Tracker"
+
+    [deps.ArrayInterface.weakdeps]
+    BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
+    BlockBandedMatrices = "ffab5731-97b5-5995-9138-79e8c1846df0"
+    CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+    CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
+    ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
+    GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+    ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
+    StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
+    Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
+
+[[deps.Artifacts]]
 uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
 
-[[BFloat16s]]
+[[deps.Atomix]]
+deps = ["UnsafeAtomics"]
+git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
+uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
+version = "0.1.0"
+
+[[deps.BFloat16s]]
 deps = ["LinearAlgebra", "Printf", "Random", "Test"]
-git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072"
+git-tree-sha1 = "2c7cc21e8678eff479978a0a2ef5ce2f51b63dff"
 uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
-version = "0.2.0"
+version = "0.5.0"
 
-[[BSON]]
-git-tree-sha1 = "306bb5574b0c1c56d7e1207581516c557d105cad"
+[[deps.BSON]]
+git-tree-sha1 = "4c3e506685c527ac6a54ccc0c8c76fd6f91b42fb"
 uuid = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
-version = "0.3.5"
+version = "0.3.9"
 
-[[Base64]]
+[[deps.Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 
-[[BenchmarkTools]]
+[[deps.BenchmarkTools]]
 deps = ["JSON", "Logging", "Printf", "Profile", "Statistics", "UUIDs"]
-git-tree-sha1 = "4c10eee4af024676200bc7752e536f858c6b8f93"
+git-tree-sha1 = "f1dff6729bc61f4d49e140da1af55dcd1ac97b2f"
 uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-version = "1.3.1"
+version = "1.5.0"
+
+[[deps.BitFlags]]
+git-tree-sha1 = "0691e34b3bb8be9307330f88d1a3c3f25466c24d"
+uuid = "d1d4a3ce-64b1-5f1a-9ba4-7e7e69966f35"
+version = "0.1.9"
+
+[[deps.Blosc_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Lz4_jll", "Zlib_jll", "Zstd_jll"]
+git-tree-sha1 = "19b98ee7e3db3b4eff74c5c9c72bf32144e24f10"
+uuid = "0b7ba130-8d10-5ba8-a3d6-c5182647fed9"
+version = "1.21.5+0"
 
-[[Bzip2_jll]]
+[[deps.Bzip2_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
+git-tree-sha1 = "9e2a6b69137e6969bab0152632dcb3bc108c8bdd"
 uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
-version = "1.0.8+0"
+version = "1.0.8+1"
 
-[[CEnum]]
-git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
+[[deps.CEnum]]
+git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc"
 uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
-version = "0.4.2"
+version = "0.5.0"
 
-[[CFTime]]
+[[deps.CFTime]]
 deps = ["Dates", "Printf"]
-git-tree-sha1 = "ed2e76c1c3c43fd9d0cb9248674620b29d71f2d1"
+git-tree-sha1 = "5afb5c5ba2688ca43a9ad2e5a91cbb93921ccfa1"
 uuid = "179af706-886a-5703-950a-314cd64e0468"
-version = "0.1.2"
+version = "0.1.3"
 
-[[CUDA]]
-deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
-git-tree-sha1 = "ba75320aaa092b3e17c020a2d8b9e0a572dbfa6a"
+[[deps.CUDA]]
+deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "StaticArrays", "Statistics"]
+git-tree-sha1 = "fdd9dfb67dfefd548f51000cc400bb51003de247"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "3.9.0"
+version = "5.4.3"
 
-[[CUDAKernels]]
-deps = ["Adapt", "CUDA", "KernelAbstractions", "StaticArrays"]
-git-tree-sha1 = "9a74283666ff859473754c9367cc88202e873eac"
-uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
-version = "0.4.1"
+    [deps.CUDA.extensions]
+    ChainRulesCoreExt = "ChainRulesCore"
+    EnzymeCoreExt = "EnzymeCore"
+    SpecialFunctionsExt = "SpecialFunctions"
+
+    [deps.CUDA.weakdeps]
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+    EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+    SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+
+[[deps.CUDA_Driver_jll]]
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
+git-tree-sha1 = "97df9d4d6be8ac6270cb8fd3b8fc413690820cbd"
+uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
+version = "0.9.1+1"
+
+[[deps.CUDA_Runtime_Discovery]]
+deps = ["Libdl"]
+git-tree-sha1 = "f3b237289a5a77c759b2dd5d4c2ff641d67c4030"
+uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
+version = "0.3.4"
 
-[[Cairo_jll]]
-deps = ["Artifacts", "Bzip2_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"]
-git-tree-sha1 = "4b859a208b2397a7a623a03449e4636bdb17bcf2"
+[[deps.CUDA_Runtime_jll]]
+deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "afea94249b821dc754a8ca6695d3daed851e1f5a"
+uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
+version = "0.14.1+0"
+
+[[deps.Cairo_jll]]
+deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"]
+git-tree-sha1 = "a2f1c8c668c8e3cb4cca4e57a8efdb09067bb3fd"
 uuid = "83423d85-b0ee-5818-9007-b63ccbeb887a"
-version = "1.16.1+1"
+version = "1.18.0+2"
 
-[[Calculus]]
-deps = ["LinearAlgebra"]
-git-tree-sha1 = "f641eb0a4f00c343bbc32346e1217b86f3ce9dad"
-uuid = "49dc2e85-a5d0-5ad3-a950-438e2897f1b9"
-version = "0.5.1"
-
-[[ChainRulesCore]]
-deps = ["Compat", "LinearAlgebra", "SparseArrays"]
-git-tree-sha1 = "9950387274246d08af38f6eef8cb5480862a435f"
-uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-version = "1.14.0"
-
-[[ChangesOfVariables]]
-deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
-git-tree-sha1 = "bf98fa45a0a4cee295de98d4c1462be26345b9a1"
-uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
-version = "0.1.2"
-
-[[ColorSchemes]]
-deps = ["ColorTypes", "Colors", "FixedPointNumbers", "Random"]
-git-tree-sha1 = "12fc73e5e0af68ad3137b886e3f7c1eacfca2640"
+[[deps.CodecZlib]]
+deps = ["TranscodingStreams", "Zlib_jll"]
+git-tree-sha1 = "b8fe8546d52ca154ac556809e10c75e6e7430ac8"
+uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
+version = "0.7.5"
+
+[[deps.ColorSchemes]]
+deps = ["ColorTypes", "ColorVectorSpace", "Colors", "FixedPointNumbers", "PrecompileTools", "Random"]
+git-tree-sha1 = "b5278586822443594ff615963b0c09755771b3e0"
 uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
-version = "3.17.1"
+version = "3.26.0"
 
-[[ColorTypes]]
+[[deps.ColorTypes]]
 deps = ["FixedPointNumbers", "Random"]
-git-tree-sha1 = "024fe24d83e4a5bf5fc80501a314ce0d1aa35597"
+git-tree-sha1 = "b10d0b65641d57b8b4d5e234446582de5047050d"
 uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-version = "0.11.0"
+version = "0.11.5"
+
+[[deps.ColorVectorSpace]]
+deps = ["ColorTypes", "FixedPointNumbers", "LinearAlgebra", "Requires", "Statistics", "TensorCore"]
+git-tree-sha1 = "a1f44953f2382ebb937d60dafbe2deea4bd23249"
+uuid = "c3611d14-8923-5661-9e6a-0046d554d3a4"
+version = "0.10.0"
+weakdeps = ["SpecialFunctions"]
 
-[[Colors]]
+    [deps.ColorVectorSpace.extensions]
+    SpecialFunctionsExt = "SpecialFunctions"
+
+[[deps.Colors]]
 deps = ["ColorTypes", "FixedPointNumbers", "Reexport"]
-git-tree-sha1 = "417b0ed7b8b838aa6ca0a87aadf1bb9eb111ce40"
+git-tree-sha1 = "362a287c3aa50601b0bc359053d5c2468f0e7ce0"
 uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
-version = "0.12.8"
+version = "0.12.11"
+
+[[deps.CommonDataModel]]
+deps = ["CFTime", "DataStructures", "Dates", "Preferences", "Printf", "Statistics"]
+git-tree-sha1 = "d6fb5bf939a2753c74984b11434ea25d6c397a58"
+uuid = "1fbeeb36-5f17-413c-809b-666fb144f157"
+version = "0.3.6"
 
-[[Compat]]
-deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
-git-tree-sha1 = "b153278a25dd42c65abbf4e62344f9d22e59191b"
+[[deps.CommonWorldInvalidations]]
+git-tree-sha1 = "ae52d1c52048455e85a387fbee9be553ec2b68d0"
+uuid = "f70d9fcc-98c5-4d4a-abd7-e4cdeebd8ca8"
+version = "1.0.0"
+
+[[deps.Compat]]
+deps = ["TOML", "UUIDs"]
+git-tree-sha1 = "b1c55339b7c6c350ee89f2c1604299660525b248"
 uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "3.43.0"
+version = "4.15.0"
+weakdeps = ["Dates", "LinearAlgebra"]
+
+    [deps.Compat.extensions]
+    CompatLinearAlgebraExt = "LinearAlgebra"
 
-[[CompilerSupportLibraries_jll]]
+[[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
+version = "1.1.1+0"
 
-[[Conda]]
+[[deps.ConcurrentUtilities]]
+deps = ["Serialization", "Sockets"]
+git-tree-sha1 = "ea32b83ca4fefa1768dc84e504cc0a94fb1ab8d1"
+uuid = "f0e56b4a-5159-44fe-b623-3e5288b988bb"
+version = "2.4.2"
+
+[[deps.Conda]]
 deps = ["Downloads", "JSON", "VersionParsing"]
-git-tree-sha1 = "6e47d11ea2776bc5627421d59cdcc1296c058071"
+git-tree-sha1 = "b19db3927f0db4151cb86d073689f2428e524576"
 uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
-version = "1.7.0"
+version = "1.10.2"
+
+[[deps.ConstructionBase]]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "d8a9c0b6ac2d9081bf76324b39c78ca3ce4f0c98"
+uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
+version = "1.5.6"
+
+    [deps.ConstructionBase.extensions]
+    ConstructionBaseIntervalSetsExt = "IntervalSets"
+    ConstructionBaseStaticArraysExt = "StaticArrays"
+
+    [deps.ConstructionBase.weakdeps]
+    IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
+    StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
-[[Contour]]
-deps = ["StaticArrays"]
-git-tree-sha1 = "9f02045d934dc030edad45944ea80dbd1f0ebea7"
+[[deps.Contour]]
+git-tree-sha1 = "439e35b0b36e2e5881738abc8857bd92ad6ff9a8"
 uuid = "d38c429a-6771-53c6-b99e-75d170b6e991"
-version = "0.5.7"
+version = "0.6.3"
 
-[[Crayons]]
+[[deps.Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
 uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
 version = "4.1.1"
 
-[[CubedSphere]]
-deps = ["Elliptic", "Printf", "Rotations", "TaylorSeries", "Test"]
-git-tree-sha1 = "f66fabd1ee5df59a7ba47c7873a6332c19e0c03f"
+[[deps.CubedSphere]]
+deps = ["Elliptic", "FFTW", "Printf", "ProgressBars", "SpecialFunctions", "TaylorSeries", "Test"]
+git-tree-sha1 = "10134667d7d3569b191a65801514271b8a93b292"
 uuid = "7445602f-e544-4518-8976-18f8e8ae6cdb"
-version = "0.2.0"
+version = "0.2.5"
 
-[[DataAPI]]
-git-tree-sha1 = "fb5f5316dd3fd4c5e7c30a24d50643b73e37cd40"
+[[deps.DataAPI]]
+git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe"
 uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
-version = "1.10.0"
+version = "1.16.0"
 
-[[DataFrames]]
-deps = ["Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
-git-tree-sha1 = "6c19003824cbebd804a51211fd3bbd81bf1ecad5"
+[[deps.DataFrames]]
+deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
+git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
 uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-version = "1.3.3"
+version = "1.6.1"
 
-[[DataStructures]]
+[[deps.DataStructures]]
 deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "3daef5523dd2e769dad2365274f760ff5f282c7d"
+git-tree-sha1 = "1d0a14036acb104d9e89698bd408f63ab58cdc82"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.18.11"
+version = "0.18.20"
 
-[[DataValueInterfaces]]
+[[deps.DataValueInterfaces]]
 git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
 uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
 version = "1.0.0"
 
-[[Dates]]
+[[deps.Dates]]
 deps = ["Printf"]
 uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
 
-[[DelimitedFiles]]
+[[deps.DelimitedFiles]]
 deps = ["Mmap"]
+git-tree-sha1 = "9e2f36d3c96a820c678f2f1f1782582fcf685bae"
 uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+version = "1.9.1"
+
+[[deps.DiskArrays]]
+deps = ["LRUCache", "OffsetArrays"]
+git-tree-sha1 = "ef25c513cad08d7ebbed158c91768ae32f308336"
+uuid = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3"
+version = "0.3.23"
 
-[[DiffRules]]
-deps = ["IrrationalConstants", "LogExpFunctions", "NaNMath", "Random", "SpecialFunctions"]
-git-tree-sha1 = "28d605d9a0ac17118fe2c5e9ce0fbb76c3ceb120"
-uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
-version = "1.11.0"
+[[deps.Distances]]
+deps = ["LinearAlgebra", "Statistics", "StatsAPI"]
+git-tree-sha1 = "66c4c81f259586e8f002eacebc177e1fb06363b0"
+uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+version = "0.10.11"
 
-[[Distributed]]
+    [deps.Distances.extensions]
+    DistancesChainRulesCoreExt = "ChainRulesCore"
+    DistancesSparseArraysExt = "SparseArrays"
+
+    [deps.Distances.weakdeps]
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+    SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[[deps.Distributed]]
 deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 
-[[DocStringExtensions]]
+[[deps.DocStringExtensions]]
 deps = ["LibGit2"]
-git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b"
+git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
 uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
-version = "0.8.6"
+version = "0.9.3"
 
-[[Downloads]]
-deps = ["ArgTools", "LibCURL", "NetworkOptions"]
+[[deps.Downloads]]
+deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
 uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+version = "1.6.0"
 
-[[DualNumbers]]
-deps = ["Calculus", "NaNMath", "SpecialFunctions"]
-git-tree-sha1 = "5837a837389fccf076445fce071c8ddaea35a566"
-uuid = "fa6b7ba4-c1ee-5f82-b5fc-ecf0adba8f74"
-version = "0.6.8"
-
-[[EarCut_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "3f3a2501fa7236e9b911e0f7a588c657e822bb6d"
-uuid = "5ae413db-bbd1-5e63-b57d-d24a61df00f5"
-version = "2.2.3+0"
-
-[[Elliptic]]
+[[deps.Elliptic]]
 git-tree-sha1 = "71c79e77221ab3a29918aaf6db4f217b89138608"
 uuid = "b305315f-e792-5b7a-8f41-49f472929428"
 version = "1.0.1"
 
-[[Expat_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "bad72f730e9e91c08d9427d5e8db95478a3c323d"
+[[deps.EpollShim_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "8e9441ee83492030ace98f9789a654a6d0b1f643"
+uuid = "2702e6a9-849d-5ed8-8c21-79e8b8f9ee43"
+version = "0.0.20230411+0"
+
+[[deps.ExceptionUnwrapping]]
+deps = ["Test"]
+git-tree-sha1 = "dcb08a0d93ec0b1cdc4af184b26b591e9695423a"
+uuid = "460bff9d-24e4-43bc-9d9f-a8973cb893f4"
+version = "0.1.10"
+
+[[deps.Expat_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "1c6317308b9dc757616f0b5cb379db10494443a7"
 uuid = "2e619515-83b5-522b-bb60-26c02a35a201"
-version = "2.4.8+0"
+version = "2.6.2+0"
 
-[[ExprTools]]
-git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d"
+[[deps.ExprTools]]
+git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec"
 uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
-version = "0.1.8"
+version = "0.1.10"
 
-[[FFMPEG]]
+[[deps.FFMPEG]]
 deps = ["FFMPEG_jll"]
 git-tree-sha1 = "b57e3acbe22f8484b4b5ff66a7499717fe1a9cc8"
 uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a"
 version = "0.4.1"
 
-[[FFMPEG_jll]]
-deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "JLLWrappers", "LAME_jll", "Libdl", "Ogg_jll", "OpenSSL_jll", "Opus_jll", "Pkg", "Zlib_jll", "libass_jll", "libfdk_aac_jll", "libvorbis_jll", "x264_jll", "x265_jll"]
-git-tree-sha1 = "d8a578692e3077ac998b50c0217dfd67f21d1e5f"
+[[deps.FFMPEG_jll]]
+deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "JLLWrappers", "LAME_jll", "Libdl", "Ogg_jll", "OpenSSL_jll", "Opus_jll", "PCRE2_jll", "Zlib_jll", "libaom_jll", "libass_jll", "libfdk_aac_jll", "libvorbis_jll", "x264_jll", "x265_jll"]
+git-tree-sha1 = "466d45dc38e15794ec7d5d63ec03d776a9aff36e"
 uuid = "b22a6f82-2f65-5046-a5b2-351ab43fb4e5"
-version = "4.4.0+0"
+version = "4.4.4+1"
 
-[[FFTW]]
+[[deps.FFTW]]
 deps = ["AbstractFFTs", "FFTW_jll", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"]
-git-tree-sha1 = "505876577b5481e50d089c1c68899dfb6faebc62"
+git-tree-sha1 = "4820348781ae578893311153d69049a93d05f39d"
 uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-version = "1.4.6"
+version = "1.8.0"
 
-[[FFTW_jll]]
+[[deps.FFTW_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "c6033cc3892d0ef5bb9cd29b7f2f0331ea5184ea"
 uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
 version = "3.3.10+0"
 
-[[FileIO]]
+[[deps.FileIO]]
 deps = ["Pkg", "Requires", "UUIDs"]
-git-tree-sha1 = "80ced645013a5dbdc52cf70329399c35ce007fae"
+git-tree-sha1 = "82d8afa92ecf4b52d78d869f038ebfb881267322"
 uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
-version = "1.13.0"
+version = "1.16.3"
 
-[[FixedPointNumbers]]
+[[deps.FileWatching]]
+uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
+
+[[deps.FixedPointNumbers]]
 deps = ["Statistics"]
-git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc"
+git-tree-sha1 = "05882d6995ae5c12bb5f36dd2ed3f61c98cbb172"
 uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
-version = "0.8.4"
+version = "0.8.5"
 
-[[Fontconfig_jll]]
-deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", "JLLWrappers", "Libdl", "Libuuid_jll", "Pkg", "Zlib_jll"]
-git-tree-sha1 = "21efd19106a55620a188615da6d3d06cd7f6ee03"
+[[deps.Fontconfig_jll]]
+deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", "JLLWrappers", "Libdl", "Libuuid_jll", "Zlib_jll"]
+git-tree-sha1 = "db16beca600632c95fc8aca29890d83788dd8b23"
 uuid = "a3f928ae-7b40-5064-980b-68af3947d34b"
-version = "2.13.93+0"
+version = "2.13.96+0"
 
-[[Formatting]]
-deps = ["Printf"]
-git-tree-sha1 = "8339d61043228fdd3eb658d86c926cb282ae72a8"
-uuid = "59287772-0a20-5a39-b81b-1366585eb4c0"
-version = "0.4.2"
+[[deps.Format]]
+git-tree-sha1 = "9c68794ef81b08086aeb32eeaf33531668d5f5fc"
+uuid = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8"
+version = "1.3.7"
 
-[[FreeType2_jll]]
-deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
-git-tree-sha1 = "87eb71354d8ec1a96d4a7636bd57a7347dde3ef9"
+[[deps.FreeType2_jll]]
+deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Zlib_jll"]
+git-tree-sha1 = "5c1d8ae0efc6c2e7b1fc502cbe25def8f661b7bc"
 uuid = "d7e528f0-a631-5988-bf34-fe36492bcfd7"
-version = "2.10.4+0"
+version = "2.13.2+0"
 
-[[FriBidi_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "aa31987c2ba8704e23c6c8ba8a4f769d5d7e4f91"
+[[deps.FriBidi_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "1ed150b39aebcc805c26b93a8d0122c940f64ce2"
 uuid = "559328eb-81f9-559d-9380-de523a88c83c"
-version = "1.0.10+0"
+version = "1.0.14+0"
 
-[[Future]]
+[[deps.Future]]
 deps = ["Random"]
 uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
 
-[[GLFW_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "Xorg_libXcursor_jll", "Xorg_libXi_jll", "Xorg_libXinerama_jll", "Xorg_libXrandr_jll"]
-git-tree-sha1 = "51d2dfe8e590fbd74e7a842cf6d13d8a2f45dc01"
+[[deps.GLFW_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Xorg_libXcursor_jll", "Xorg_libXi_jll", "Xorg_libXinerama_jll", "Xorg_libXrandr_jll", "xkbcommon_jll"]
+git-tree-sha1 = "3f74912a156096bd8fdbef211eff66ab446e7297"
 uuid = "0656b61e-2033-5cc2-a64a-77c0f6c09b89"
-version = "3.3.6+0"
+version = "3.4.0+0"
+
+[[deps.GMP_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "781609d7-10c4-51f6-84f2-b8444358ff6d"
+version = "6.2.1+6"
 
-[[GPUArrays]]
-deps = ["Adapt", "LLVM", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
-git-tree-sha1 = "c783e8883028bf26fb05ed4022c450ef44edd875"
+[[deps.GPUArrays]]
+deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
+git-tree-sha1 = "a74c3f1cf56a3dfcdef0605f8cdb7015926aae30"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "8.3.2"
+version = "10.3.0"
 
-[[GPUCompiler]]
-deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "556190e1e0ea3e37d83059fc9aa576f1e2104375"
+[[deps.GPUArraysCore]]
+deps = ["Adapt"]
+git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950"
+uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
+version = "0.1.6"
+
+[[deps.GPUCompiler]]
+deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Preferences", "Scratch", "Serialization", "TOML", "TimerOutputs", "UUIDs"]
+git-tree-sha1 = "ab29216184312f99ff957b32cd63c2fe9c928b91"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.14.1"
+version = "0.26.7"
 
-[[GR]]
-deps = ["Base64", "DelimitedFiles", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Pkg", "Printf", "Random", "RelocatableFolders", "Serialization", "Sockets", "Test", "UUIDs"]
-git-tree-sha1 = "af237c08bda486b74318c8070adb96efa6952530"
+[[deps.GR]]
+deps = ["Artifacts", "Base64", "DelimitedFiles", "Downloads", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Preferences", "Printf", "Qt6Wayland_jll", "Random", "Serialization", "Sockets", "TOML", "Tar", "Test", "p7zip_jll"]
+git-tree-sha1 = "629693584cef594c3f6f99e76e7a7ad17e60e8d5"
 uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71"
-version = "0.64.2"
+version = "0.73.7"
 
-[[GR_jll]]
-deps = ["Artifacts", "Bzip2_jll", "Cairo_jll", "FFMPEG_jll", "Fontconfig_jll", "GLFW_jll", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Libtiff_jll", "Pixman_jll", "Pkg", "Qt5Base_jll", "Zlib_jll", "libpng_jll"]
-git-tree-sha1 = "cd6efcf9dc746b06709df14e462f0a3fe0786b1e"
+[[deps.GR_jll]]
+deps = ["Artifacts", "Bzip2_jll", "Cairo_jll", "FFMPEG_jll", "Fontconfig_jll", "FreeType2_jll", "GLFW_jll", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Libtiff_jll", "Pixman_jll", "Qt6Base_jll", "Zlib_jll", "libpng_jll"]
+git-tree-sha1 = "a8863b69c2a0859f2c2c87ebdc4c6712e88bdf0d"
 uuid = "d2c73de3-f751-5644-a686-071e5b155ba9"
-version = "0.64.2+0"
-
-[[GeometryBasics]]
-deps = ["EarCut_jll", "IterTools", "LinearAlgebra", "StaticArrays", "StructArrays", "Tables"]
-git-tree-sha1 = "83ea630384a13fc4f002b77690bc0afeb4255ac9"
-uuid = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
-version = "0.4.2"
+version = "0.73.7+0"
 
-[[Gettext_jll]]
+[[deps.Gettext_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "XML2_jll"]
 git-tree-sha1 = "9b02998aba7bf074d14de89f9d37ca24a1a0b046"
 uuid = "78b55507-aeef-58d4-861c-77aaff3498b1"
 version = "0.21.0+0"
 
-[[Glib_jll]]
-deps = ["Artifacts", "Gettext_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Libiconv_jll", "Libmount_jll", "PCRE_jll", "Pkg", "Zlib_jll"]
-git-tree-sha1 = "a32d672ac2c967f3deb8a81d828afc739c838a06"
+[[deps.Glib_jll]]
+deps = ["Artifacts", "Gettext_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Libiconv_jll", "Libmount_jll", "PCRE2_jll", "Zlib_jll"]
+git-tree-sha1 = "7c82e6a6cd34e9d935e9aa4051b66c6ff3af59ba"
 uuid = "7746bdde-850d-59dc-9ae8-88ece973131d"
-version = "2.68.3+2"
+version = "2.80.2+0"
 
-[[Glob]]
-git-tree-sha1 = "4df9f7e06108728ebf00a0a11edee4b29a482bb2"
+[[deps.Glob]]
+git-tree-sha1 = "97285bbd5230dd766e9ef6749b80fc617126d496"
 uuid = "c27321d9-0574-5035-807b-f59d2c89b15c"
-version = "1.3.0"
+version = "1.3.1"
+
+[[deps.GnuTLS_jll]]
+deps = ["Artifacts", "GMP_jll", "JLLWrappers", "Libdl", "Nettle_jll", "P11Kit_jll", "Zlib_jll"]
+git-tree-sha1 = "383db7d3f900f4c1f47a8a04115b053c095e48d3"
+uuid = "0951126a-58fd-58f1-b5b3-b08c7c4a876d"
+version = "3.8.4+0"
 
-[[Graphite2_jll]]
+[[deps.Graphite2_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "344bf40dcab1073aca04aa0df4fb092f920e4011"
 uuid = "3b182d85-2403-5c21-9c21-1e1f0cc25472"
 version = "1.3.14+0"
 
-[[Grisu]]
+[[deps.Grisu]]
 git-tree-sha1 = "53bb909d1151e57e2484c3d1b53e19552b887fb2"
 uuid = "42e2da0e-8278-4e71-bc24-59509adca0fe"
 version = "1.0.2"
 
-[[HDF5_jll]]
-deps = ["Artifacts", "JLLWrappers", "LibCURL_jll", "Libdl", "OpenSSL_jll", "Pkg", "Zlib_jll"]
-git-tree-sha1 = "fd83fa0bde42e01952757f01149dd968c06c4dba"
+[[deps.HDF5_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "LibCURL_jll", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "OpenSSL_jll", "TOML", "Zlib_jll", "libaec_jll"]
+git-tree-sha1 = "82a471768b513dc39e471540fdadc84ff80ff997"
 uuid = "0234f1f7-429e-5d53-9886-15a909be8d59"
-version = "1.12.0+1"
+version = "1.14.3+3"
 
-[[HTTP]]
-deps = ["Base64", "Dates", "IniFile", "Logging", "MbedTLS", "NetworkOptions", "Sockets", "URIs"]
-git-tree-sha1 = "0fa77022fe4b511826b39c894c90daf5fce3334a"
+[[deps.HTTP]]
+deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"]
+git-tree-sha1 = "d1d712be3164d61d1fb98e7ce9bcbc6cc06b45ed"
 uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
-version = "0.9.17"
+version = "1.10.8"
 
-[[HarfBuzz_jll]]
+[[deps.HarfBuzz_jll]]
 deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg"]
 git-tree-sha1 = "129acf094d168394e80ee1dc4bc06ec835e510a3"
 uuid = "2e76f6c2-a576-52d4-95c1-20adfe4de566"
 version = "2.8.1+1"
 
-[[IfElse]]
+[[deps.Hwloc_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "5e19e1e4fa3e71b774ce746274364aef0234634e"
+uuid = "e33a78d0-f292-5ffc-b300-72abe9b543c8"
+version = "2.11.1+0"
+
+[[deps.IfElse]]
 git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1"
 uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
 version = "0.1.1"
 
-[[IncompleteLU]]
+[[deps.IncompleteLU]]
 deps = ["LinearAlgebra", "SparseArrays"]
-git-tree-sha1 = "a22b92ffedeb499383720dfedcd473deb9608b62"
+git-tree-sha1 = "6c676e79f98abb6d33fa28122cad099f1e464afe"
 uuid = "40713840-3770-5561-ab4c-a76e7d0d7895"
-version = "0.2.0"
+version = "0.2.1"
+
+[[deps.InlineStrings]]
+git-tree-sha1 = "45521d31238e87ee9f9732561bfee12d4eebd52d"
+uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
+version = "1.4.2"
 
-[[IniFile]]
-git-tree-sha1 = "f550e6e32074c939295eb5ea6de31849ac2c9625"
-uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f"
-version = "0.5.1"
+    [deps.InlineStrings.extensions]
+    ArrowTypesExt = "ArrowTypes"
+    ParsersExt = "Parsers"
 
-[[IntelOpenMP_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "d979e54b71da82f3a65b62553da4fc3d18c9004c"
+    [deps.InlineStrings.weakdeps]
+    ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
+    Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
+
+[[deps.IntelOpenMP_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "14eb2b542e748570b56446f4c50fbfb2306ebc45"
 uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
-version = "2018.0.3+2"
+version = "2024.2.0+0"
 
-[[InteractiveUtils]]
+[[deps.InteractiveUtils]]
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
-[[InverseFunctions]]
-deps = ["Test"]
-git-tree-sha1 = "91b5dcf362c5add98049e6c29ee756910b03051d"
-uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
-version = "0.1.3"
-
-[[InvertedIndices]]
-git-tree-sha1 = "bee5f1ef5bf65df56bdd2e40447590b272a5471f"
+[[deps.InvertedIndices]]
+git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
 uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
-version = "1.1.0"
+version = "1.3.0"
 
-[[IrrationalConstants]]
-git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
+[[deps.IrrationalConstants]]
+git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
 uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
-version = "0.1.1"
-
-[[IterTools]]
-git-tree-sha1 = "fa6287a4469f5e048d763df38279ee729fbd44e5"
-uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
-version = "1.4.0"
+version = "0.2.2"
 
-[[IterativeSolvers]]
+[[deps.IterativeSolvers]]
 deps = ["LinearAlgebra", "Printf", "Random", "RecipesBase", "SparseArrays"]
-git-tree-sha1 = "1169632f425f79429f245113b775a0e3d121457c"
+git-tree-sha1 = "59545b0a2b27208b0650df0a46b8e3019f85055b"
 uuid = "42fd0dbc-a981-5370-80f2-aaf504508153"
-version = "0.9.2"
+version = "0.9.4"
 
-[[IteratorInterfaceExtensions]]
+[[deps.IteratorInterfaceExtensions]]
 git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
 uuid = "82899510-4779-5014-852e-03e436cf321d"
 version = "1.0.0"
 
-[[JLD2]]
-deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "Pkg", "Printf", "Reexport", "TranscodingStreams", "UUIDs"]
-git-tree-sha1 = "81b9477b49402b47fbe7f7ae0b252077f53e4a08"
+[[deps.JLD2]]
+deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "Pkg", "PrecompileTools", "Reexport", "Requires", "TranscodingStreams", "UUIDs", "Unicode"]
+git-tree-sha1 = "5fe858cb863e211c6dedc8cce2dc0752d4ab6e2b"
 uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-version = "0.4.22"
+version = "0.4.50"
 
-[[JLLWrappers]]
-deps = ["Preferences"]
-git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
+[[deps.JLFzf]]
+deps = ["Pipe", "REPL", "Random", "fzf_jll"]
+git-tree-sha1 = "a53ebe394b71470c7f97c2e7e170d51df21b17af"
+uuid = "1019f520-868f-41f5-a6de-eb00f4b6a39c"
+version = "0.1.7"
+
+[[deps.JLLWrappers]]
+deps = ["Artifacts", "Preferences"]
+git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca"
 uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
-version = "1.4.1"
+version = "1.5.0"
 
-[[JSON]]
+[[deps.JSON]]
 deps = ["Dates", "Mmap", "Parsers", "Unicode"]
-git-tree-sha1 = "3c837543ddb02250ef42f4738347454f95079d4e"
+git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a"
 uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
-version = "0.21.3"
+version = "0.21.4"
 
-[[JSON3]]
-deps = ["Dates", "Mmap", "Parsers", "StructTypes", "UUIDs"]
-git-tree-sha1 = "8c1f668b24d999fb47baf80436194fdccec65ad2"
+[[deps.JSON3]]
+deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"]
+git-tree-sha1 = "95220473901735a0f4df9d1ca5b171b568b2daa3"
 uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
-version = "1.9.4"
+version = "1.13.2"
 
-[[JpegTurbo_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "b53380851c6e6664204efb2e62cd24fa5c47e4ba"
+[[deps.JpegTurbo_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "c84a835e1a09b289ffcd2271bf2a337bbdda6637"
 uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8"
-version = "2.1.2+0"
+version = "3.0.3+0"
 
-[[KernelAbstractions]]
-deps = ["Adapt", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs"]
-git-tree-sha1 = "883ea9474c2a091dc6a698b525f47a651ae133b9"
+[[deps.JuliaNVTXCallbacks_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f"
+uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e"
+version = "0.2.1+0"
+
+[[deps.KernelAbstractions]]
+deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
+git-tree-sha1 = "d0448cebd5919e06ca5edc7a264631790de810ec"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-version = "0.8.0"
+version = "0.9.22"
 
-[[LAME_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "f6250b16881adf048549549fba48b1161acdac8c"
+    [deps.KernelAbstractions.extensions]
+    EnzymeExt = "EnzymeCore"
+
+    [deps.KernelAbstractions.weakdeps]
+    EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+
+[[deps.LAME_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "170b660facf5df5de098d866564877e119141cbd"
 uuid = "c1c5ebd0-6772-5130-a774-d5fcae4a789d"
-version = "3.100.1+0"
+version = "3.100.2+0"
 
-[[LERC_jll]]
+[[deps.LERC_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "bf36f528eec6634efc60d7ec062008f171071434"
 uuid = "88015f11-f218-50d7-93a8-a6af411a945d"
 version = "3.0.0+1"
 
-[[LLVM]]
-deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "c9b86064be5ae0f63e50816a5a90b08c474507ae"
+[[deps.LLVM]]
+deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"]
+git-tree-sha1 = "020abd49586480c1be84f57da0017b5d3db73f7c"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "4.9.1"
+version = "8.0.0"
+weakdeps = ["BFloat16s"]
 
-[[LLVMExtra_jll]]
-deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
-git-tree-sha1 = "5558ad3c8972d602451efe9d81c78ec14ef4f5ef"
+    [deps.LLVM.extensions]
+    BFloat16sExt = "BFloat16s"
+
+[[deps.LLVMExtra_jll]]
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "c2636c264861edc6d305e6b4d528f09566d24c5e"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.14+2"
+version = "0.0.30+0"
 
-[[LZO_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "e5b909bcf985c5e2605737d2ce278ed791b89be6"
+[[deps.LLVMLoopInfo]]
+git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea"
+uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586"
+version = "1.0.0"
+
+[[deps.LLVMOpenMP_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "d986ce2d884d49126836ea94ed5bfb0f12679713"
+uuid = "1d63c593-3942-5779-bab2-d838dc0a180e"
+version = "15.0.7+0"
+
+[[deps.LRUCache]]
+git-tree-sha1 = "b3cc6698599b10e652832c2f23db3cab99d51b59"
+uuid = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
+version = "1.6.1"
+weakdeps = ["Serialization"]
+
+    [deps.LRUCache.extensions]
+    SerializationExt = ["Serialization"]
+
+[[deps.LZO_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "70c5da094887fd2cae843b8db33920bac4b6f07d"
 uuid = "dd4b983a-f0e5-5f8d-a1b7-129d4a5fb1ac"
-version = "2.10.1+0"
+version = "2.10.2+0"
 
-[[LaTeXStrings]]
-git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996"
+[[deps.LaTeXStrings]]
+git-tree-sha1 = "50901ebc375ed41dbf8058da26f9de442febbbec"
 uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
-version = "1.3.0"
+version = "1.3.1"
 
-[[Latexify]]
-deps = ["Formatting", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "Printf", "Requires"]
-git-tree-sha1 = "46a39b9c58749eefb5f2dc1178cb8fab5332b1ab"
+[[deps.Latexify]]
+deps = ["Format", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "OrderedCollections", "Requires"]
+git-tree-sha1 = "5b0d630f3020b82c0775a51d05895852f8506f50"
 uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
-version = "0.15.15"
+version = "0.16.4"
+
+    [deps.Latexify.extensions]
+    DataFramesExt = "DataFrames"
+    SymEngineExt = "SymEngine"
 
-[[LazyArtifacts]]
+    [deps.Latexify.weakdeps]
+    DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+    SymEngine = "123dc426-2d89-5057-bbad-38513e3affd8"
+
+[[deps.LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
 uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
 
-[[LeftChildRightSiblingTrees]]
+[[deps.LeftChildRightSiblingTrees]]
 deps = ["AbstractTrees"]
-git-tree-sha1 = "b864cb409e8e445688bc478ef87c0afe4f6d1f8d"
+git-tree-sha1 = "fb6803dafae4a5d62ea5cab204b1e657d9737e7f"
 uuid = "1d6d02ad-be62-4b6b-8a6d-2f90e265016e"
-version = "0.1.3"
+version = "0.2.0"
 
-[[LibCURL]]
+[[deps.LibCURL]]
 deps = ["LibCURL_jll", "MozillaCACerts_jll"]
 uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
+version = "0.6.4"
 
-[[LibCURL_jll]]
+[[deps.LibCURL_jll]]
 deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
 uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
+version = "8.4.0+0"
 
-[[LibGit2]]
-deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
+[[deps.LibGit2]]
+deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
 
-[[LibSSH2_jll]]
+[[deps.LibGit2_jll]]
+deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"]
+uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5"
+version = "1.6.4+0"
+
+[[deps.LibSSH2_jll]]
 deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
 uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
+version = "1.11.0+1"
 
-[[Libdl]]
+[[deps.Libdl]]
 uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 
-[[Libffi_jll]]
+[[deps.Libffi_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "0b4a5d71f3e5200a7dff793393e09dfc2d874290"
 uuid = "e9f186c6-92d2-5b65-8a66-fee21dc1b490"
 version = "3.2.2+1"
 
-[[Libgcrypt_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"]
-git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae"
+[[deps.Libgcrypt_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll"]
+git-tree-sha1 = "9fd170c4bbfd8b935fdc5f8b7aa33532c991a673"
 uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4"
-version = "1.8.7+0"
+version = "1.8.11+0"
 
-[[Libglvnd_jll]]
+[[deps.Libglvnd_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"]
-git-tree-sha1 = "7739f837d6447403596a75d19ed01fd08d6f56bf"
+git-tree-sha1 = "6f73d1dd803986947b2c750138528a999a6c7733"
 uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29"
-version = "1.3.0+3"
+version = "1.6.0+0"
 
-[[Libgpg_error_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9"
+[[deps.Libgpg_error_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "fbb1f2bef882392312feb1ede3615ddc1e9b99ed"
 uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8"
-version = "1.42.0+0"
+version = "1.49.0+0"
 
-[[Libiconv_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "42b62845d70a619f063a7da093d995ec8e15e778"
+[[deps.Libiconv_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175"
 uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
-version = "1.16.1+1"
+version = "1.17.0+0"
 
-[[Libmount_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "9c30530bf0effd46e15e0fdcf2b8636e78cbbd73"
+[[deps.Libmount_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "0c4f9c4f1a50d8f35048fa0532dabbadf702f81e"
 uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9"
-version = "2.35.0+0"
+version = "2.40.1+0"
 
-[[Libtiff_jll]]
-deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "Pkg", "Zlib_jll", "Zstd_jll"]
-git-tree-sha1 = "c9551dd26e31ab17b86cbd00c2ede019c08758eb"
+[[deps.Libtiff_jll]]
+deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"]
+git-tree-sha1 = "2da088d113af58221c52828a80378e16be7d037a"
 uuid = "89763e89-9b03-5906-acba-b20f662cd828"
-version = "4.3.0+1"
+version = "4.5.1+1"
 
-[[Libuuid_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "7f3efec06033682db852f8b3bc3c1d2b0a0ab066"
+[[deps.Libuuid_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "5ee6203157c120d79034c748a2acba45b82b8807"
 uuid = "38a345b3-de98-5d2b-a5d3-14cd9215e700"
-version = "2.36.0+0"
+version = "2.40.1+0"
 
-[[LinearAlgebra]]
-deps = ["Libdl", "libblastrampoline_jll"]
+[[deps.LinearAlgebra]]
+deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
 uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
-[[LogExpFunctions]]
-deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
-git-tree-sha1 = "76c987446e8d555677f064aaac1145c4c17662f8"
+[[deps.LogExpFunctions]]
+deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
+git-tree-sha1 = "a2d09619db4e765091ee5c6ffe8872849de0feea"
 uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
-version = "0.3.14"
+version = "0.3.28"
+
+    [deps.LogExpFunctions.extensions]
+    LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
+    LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
+    LogExpFunctionsInverseFunctionsExt = "InverseFunctions"
 
-[[Logging]]
+    [deps.LogExpFunctions.weakdeps]
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+    ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
+    InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
+
+[[deps.Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
-[[MKL_jll]]
-deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
-git-tree-sha1 = "e595b205efd49508358f7dc670a940c790204629"
+[[deps.LoggingExtras]]
+deps = ["Dates", "Logging"]
+git-tree-sha1 = "c1dd6d7978c12545b4179fb6153b9250c96b0075"
+uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36"
+version = "1.0.3"
+
+[[deps.Lz4_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "6c26c5e8a4203d43b5497be3ec5d4e0c3cde240a"
+uuid = "5ced341a-0733-55b8-9ab6-a4889d929147"
+version = "1.9.4+0"
+
+[[deps.MKL_jll]]
+deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "oneTBB_jll"]
+git-tree-sha1 = "f046ccd0c6db2832a9f639e2c669c6fe867e5f4f"
 uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
-version = "2022.0.0+0"
+version = "2024.2.0+0"
 
-[[MPI]]
-deps = ["Distributed", "DocStringExtensions", "Libdl", "MPICH_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "Pkg", "Random", "Requires", "Serialization", "Sockets"]
-git-tree-sha1 = "d56a80d8cf8b9dc3050116346b3d83432b1912c0"
+[[deps.MPI]]
+deps = ["Distributed", "DocStringExtensions", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "PkgVersion", "PrecompileTools", "Requires", "Serialization", "Sockets"]
+git-tree-sha1 = "14cef41baf5b675b192b02a22c710f725ab333a7"
 uuid = "da04e1cc-30fd-572f-bb4f-1f8673147195"
-version = "0.19.2"
+version = "0.20.20"
 
-[[MPICH_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "3dacfc006764fe498515a022c3976b7e133c4008"
+    [deps.MPI.extensions]
+    AMDGPUExt = "AMDGPU"
+    CUDAExt = "CUDA"
+
+    [deps.MPI.weakdeps]
+    AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+    CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[[deps.MPICH_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "Hwloc_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "19d4bd098928a3263693991500d05d74dbdc2004"
 uuid = "7cb0a576-ebde-5e09-9194-50597f1243b4"
-version = "4.0.2+0"
+version = "4.2.2+0"
+
+[[deps.MPIPreferences]]
+deps = ["Libdl", "Preferences"]
+git-tree-sha1 = "c105fe467859e7f6e9a852cb15cb4301126fac07"
+uuid = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
+version = "0.1.11"
 
-[[MacroTools]]
+[[deps.MPItrampoline_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "8c35d5420193841b2f367e658540e8d9e0601ed0"
+uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748"
+version = "5.4.0+0"
+
+[[deps.MacroTools]]
 deps = ["Markdown", "Random"]
-git-tree-sha1 = "3d3e902b31198a27340d0bf00d6ac452866021cf"
+git-tree-sha1 = "2fa9ee3e63fd3a4f7a9a4f4744a52f4856de82df"
 uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-version = "0.5.9"
+version = "0.5.13"
 
-[[Markdown]]
+[[deps.Markdown]]
 deps = ["Base64"]
 uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 
-[[MbedTLS]]
-deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"]
-git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe"
+[[deps.MbedTLS]]
+deps = ["Dates", "MbedTLS_jll", "MozillaCACerts_jll", "NetworkOptions", "Random", "Sockets"]
+git-tree-sha1 = "c067a280ddc25f196b5e7df3877c6b226d390aaf"
 uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
-version = "1.0.3"
+version = "1.1.9"
 
-[[MbedTLS_jll]]
+[[deps.MbedTLS_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
+version = "2.28.2+1"
 
-[[Measures]]
-git-tree-sha1 = "e498ddeee6f9fdb4551ce855a46f54dbd900245f"
+[[deps.Measures]]
+git-tree-sha1 = "c13304c81eec1ed3af7fc20e75fb6b26092a1102"
 uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e"
-version = "0.3.1"
+version = "0.3.2"
 
-[[MicrosoftMPI_jll]]
+[[deps.MicrosoftMPI_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a16aa086d335ed7e0170c5265247db29172af2f9"
+git-tree-sha1 = "f12a29c4400ba812841c6ace3f4efbb6dbb3ba01"
 uuid = "9237b28f-5490-5468-be7b-bb81f5f5e6cf"
-version = "10.1.3+2"
+version = "10.1.4+2"
 
-[[Missings]]
+[[deps.Missings]]
 deps = ["DataAPI"]
-git-tree-sha1 = "bf210ce90b6c9eed32d25dbcae1ebc565df2687f"
+git-tree-sha1 = "ec4f7fbeab05d7747bdf98eb74d130a2a2ed298d"
 uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
-version = "1.0.2"
+version = "1.2.0"
 
-[[Mmap]]
+[[deps.Mmap]]
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"
 
-[[MozillaCACerts_jll]]
+[[deps.MozillaCACerts_jll]]
 uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
+version = "2023.1.10"
 
-[[NCDatasets]]
-deps = ["CFTime", "DataStructures", "Dates", "NetCDF_jll", "Printf"]
-git-tree-sha1 = "4e52976dc950413a8458273af8f7eaae5f7e899a"
+[[deps.NCDatasets]]
+deps = ["CFTime", "CommonDataModel", "DataStructures", "Dates", "DiskArrays", "NetCDF_jll", "NetworkOptions", "Printf"]
+git-tree-sha1 = "a640912695952b074672edb5f9aaee2f7f9fd59a"
 uuid = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
-version = "0.12.4"
+version = "0.14.4"
 
-[[NaNMath]]
-git-tree-sha1 = "737a5957f387b17e74d4ad2f440eb330b39a62c5"
+[[deps.NVTX]]
+deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"]
+git-tree-sha1 = "53046f0483375e3ed78e49190f1154fa0a4083a1"
+uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
+version = "0.3.4"
+
+[[deps.NVTX_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b"
+uuid = "e98f9f5b-d649-5603-91fd-7774390e6439"
+version = "3.1.0+2"
+
+[[deps.NaNMath]]
+deps = ["OpenLibm_jll"]
+git-tree-sha1 = "0877504529a3e5c3343c6f8b4c0381e57e4387e4"
 uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
-version = "1.0.0"
+version = "1.0.2"
 
-[[NetCDF_jll]]
-deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "LibCURL_jll", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Pkg", "Zlib_jll", "nghttp2_jll"]
-git-tree-sha1 = "0cf4d1bf2ef45156aed85c9ac5f8c7e697d9288c"
+[[deps.NetCDF_jll]]
+deps = ["Artifacts", "Blosc_jll", "Bzip2_jll", "HDF5_jll", "JLLWrappers", "LazyArtifacts", "LibCURL_jll", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "TOML", "XML2_jll", "Zlib_jll", "Zstd_jll", "libzip_jll"]
+git-tree-sha1 = "4686378c4ae1d1948cfbe46c002a11a4265dcb07"
 uuid = "7243133f-43d8-5620-bbf4-c2c921802cf3"
-version = "400.702.400+0"
+version = "400.902.211+1"
 
-[[NetworkOptions]]
+[[deps.Nettle_jll]]
+deps = ["Artifacts", "GMP_jll", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "eca63e3847dad608cfa6a3329b95ef674c7160b4"
+uuid = "4c82536e-c426-54e4-b420-14f461c4ed8b"
+version = "3.7.2+0"
+
+[[deps.NetworkOptions]]
 uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
+version = "1.2.0"
 
-[[Oceananigans]]
-deps = ["Adapt", "BenchmarkTools", "CUDA", "CUDAKernels", "Crayons", "CubedSphere", "Dates", "DocStringExtensions", "FFTW", "Glob", "IncompleteLU", "InteractiveUtils", "IterativeSolvers", "JLD2", "KernelAbstractions", "LinearAlgebra", "Logging", "MPI", "MPICH_jll", "NCDatasets", "OffsetArrays", "OrderedCollections", "PencilFFTs", "Pkg", "Printf", "Random", "Rotations", "SafeTestsets", "SeawaterPolynomials", "SparseArrays", "Statistics", "StructArrays", "Tullio"]
-path = ".."
+[[deps.Oceananigans]]
+deps = ["Adapt", "CUDA", "Crayons", "CubedSphere", "Dates", "Distances", "DocStringExtensions", "FFTW", "Glob", "IncompleteLU", "InteractiveUtils", "IterativeSolvers", "JLD2", "KernelAbstractions", "LinearAlgebra", "Logging", "MPI", "NCDatasets", "OffsetArrays", "OrderedCollections", "PencilArrays", "PencilFFTs", "Pkg", "Printf", "Random", "Rotations", "SeawaterPolynomials", "SparseArrays", "Statistics", "StructArrays"]
+git-tree-sha1 = "1860ea386acda848ea9190d3136bb25b5410df39"
+repo-rev = "ss/one-sided-upwind"
+repo-url = ".."
 uuid = "9e8cae18-63c1-5223-a75c-80ca9d6e9a09"
-version = "0.76.0"
+version = "0.91.4"
 
-[[OffsetArrays]]
-deps = ["Adapt"]
-git-tree-sha1 = "043017e0bdeff61cfbb7afeb558ab29536bbb5ed"
+    [deps.Oceananigans.extensions]
+    OceananigansEnzymeExt = "Enzyme"
+
+    [deps.Oceananigans.weakdeps]
+    Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
+
+[[deps.OffsetArrays]]
+git-tree-sha1 = "1a27764e945a152f7ca7efa04de513d473e9542e"
 uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
-version = "1.10.8"
+version = "1.14.1"
+weakdeps = ["Adapt"]
 
-[[Ogg_jll]]
+    [deps.OffsetArrays.extensions]
+    OffsetArraysAdaptExt = "Adapt"
+
+[[deps.Ogg_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "887579a3eb005446d514ab7aeac5d1d027658b8f"
 uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051"
 version = "1.3.5+1"
 
-[[OpenBLAS_jll]]
+[[deps.OpenBLAS_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
 uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
+version = "0.3.23+4"
 
-[[OpenLibm_jll]]
+[[deps.OpenLibm_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
+version = "0.8.1+2"
 
-[[OpenMPI_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
-git-tree-sha1 = "6340586e076b2abd41f5ba1a3b9c774ec6b30fde"
+[[deps.OpenMPI_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "e25c1778a98e34219a00455d6e4384e017ea9762"
 uuid = "fe0851c0-eecd-5654-98d4-656369965a5c"
-version = "4.1.2+0"
+version = "4.1.6+0"
 
-[[OpenSSL_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "ab05aa4cc89736e95915b01e7279e61b1bfe33b8"
+[[deps.OpenSSL]]
+deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"]
+git-tree-sha1 = "38cb508d080d21dc1128f7fb04f20387ed4c0af4"
+uuid = "4d8831e6-92b7-49fb-bdf8-b643e874388c"
+version = "1.4.3"
+
+[[deps.OpenSSL_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "a028ee3cb5641cccc4c24e90c36b0a4f7707bdf5"
 uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95"
-version = "1.1.14+0"
+version = "3.0.14+0"
 
-[[OpenSpecFun_jll]]
+[[deps.OpenSpecFun_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
 uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
 version = "0.5.5+0"
 
-[[Opus_jll]]
+[[deps.Opus_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "51a08fb14ec28da2ec7a927c4337e4332c2a4720"
 uuid = "91d4177d-7536-5919-b921-800302f37372"
 version = "1.3.2+0"
 
-[[OrderedCollections]]
-git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
+[[deps.OrderedCollections]]
+git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.4.1"
+version = "1.6.3"
 
-[[PCRE_jll]]
+[[deps.P11Kit_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "b2a7af664e098055a7529ad1a900ded962bca488"
-uuid = "2f80f16e-611a-54ab-bc61-aa92de5b98fc"
-version = "8.44.0+0"
+git-tree-sha1 = "2cd396108e178f3ae8dedbd8e938a18726ab2fbf"
+uuid = "c2071276-7c44-58a7-b746-946036e04d0a"
+version = "0.24.1+0"
 
-[[Parsers]]
-deps = ["Dates"]
-git-tree-sha1 = "1285416549ccfcdf0c50d4997a94331e88d68413"
+[[deps.PCRE2_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "efcefdf7-47ab-520b-bdef-62a2eaa19f15"
+version = "10.42.0+1"
+
+[[deps.PackageExtensionCompat]]
+git-tree-sha1 = "fb28e33b8a95c4cee25ce296c817d89cc2e53518"
+uuid = "65ce6f38-6b18-4e1d-a461-8949797d7930"
+version = "1.0.2"
+weakdeps = ["Requires", "TOML"]
+
+[[deps.Parsers]]
+deps = ["Dates", "PrecompileTools", "UUIDs"]
+git-tree-sha1 = "8489905bcdbcfac64d1daa51ca07c0d8f0283821"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "2.3.1"
+version = "2.8.1"
 
-[[PencilArrays]]
-deps = ["Adapt", "ArrayInterface", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "Requires", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
-git-tree-sha1 = "690dd0accf18b2a1e26ef10820665f7192dacad3"
+[[deps.PencilArrays]]
+deps = ["Adapt", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "StaticArrayInterface", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
+git-tree-sha1 = "fa85ac32172d96cfdb91dbc53e8e57007e5a2b5a"
 uuid = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e"
-version = "0.16.1"
+version = "0.19.5"
+
+    [deps.PencilArrays.extensions]
+    PencilArraysAMDGPUExt = ["AMDGPU"]
+    PencilArraysDiffEqExt = ["DiffEqBase"]
+    PencilArraysHDF5Ext = ["HDF5"]
 
-[[PencilFFTs]]
+    [deps.PencilArrays.weakdeps]
+    AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+    DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
+    HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
+
+[[deps.PencilFFTs]]
 deps = ["AbstractFFTs", "FFTW", "LinearAlgebra", "MPI", "PencilArrays", "Reexport", "TimerOutputs"]
-git-tree-sha1 = "6a8f42f2fab1b0966ba8e3bada37a6bd416636d3"
+git-tree-sha1 = "bd69f3f0ee248cfb4241800aefb705b5ded592ff"
 uuid = "4a48f351-57a6-4416-9ec4-c37015456aae"
-version = "0.13.4"
+version = "0.15.1"
 
-[[Pixman_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "b4f5d02549a10e20780a24fce72bea96b6329e29"
+[[deps.Pipe]]
+git-tree-sha1 = "6842804e7867b115ca9de748a0cf6b364523c16d"
+uuid = "b98c9c47-44ae-5843-9183-064241ee97a0"
+version = "1.3.0"
+
+[[deps.Pixman_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LLVMOpenMP_jll", "Libdl"]
+git-tree-sha1 = "35621f10a7531bc8fa58f74610b1bfb70a3cfc6b"
 uuid = "30392449-352a-5448-841d-b1acce4e97dc"
-version = "0.40.1+0"
+version = "0.43.4+0"
 
-[[Pkg]]
-deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
+[[deps.Pkg]]
+deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+version = "1.10.0"
 
-[[PkgBenchmark]]
+[[deps.PkgBenchmark]]
 deps = ["BenchmarkTools", "Dates", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Pkg", "Printf", "TerminalLoggers", "UUIDs"]
 git-tree-sha1 = "e4a10b7cdb7ec836850e43a4cee196f4e7b02756"
 uuid = "32113eaa-f34f-5b0d-bd6c-c81e245fc73d"
 version = "0.2.12"
 
-[[PlotThemes]]
+[[deps.PkgVersion]]
+deps = ["Pkg"]
+git-tree-sha1 = "f9501cc0430a26bc3d156ae1b5b0c1b47af4d6da"
+uuid = "eebad327-c553-4316-9ea0-9fa01ccd7688"
+version = "0.3.3"
+
+[[deps.PlotThemes]]
 deps = ["PlotUtils", "Statistics"]
-git-tree-sha1 = "8162b2f8547bc23876edd0c5181b27702ae58dce"
+git-tree-sha1 = "6e55c6841ce3411ccb3457ee52fc48cb698d6fb0"
 uuid = "ccf2f8ad-2431-5c83-bf29-c5338b663b6a"
-version = "3.0.0"
+version = "3.2.0"
 
-[[PlotUtils]]
-deps = ["ColorSchemes", "Colors", "Dates", "Printf", "Random", "Reexport", "Statistics"]
-git-tree-sha1 = "bb16469fd5224100e422f0b027d26c5a25de1200"
+[[deps.PlotUtils]]
+deps = ["ColorSchemes", "Colors", "Dates", "PrecompileTools", "Printf", "Random", "Reexport", "Statistics"]
+git-tree-sha1 = "7b1a9df27f072ac4c9c7cbe5efb198489258d1f5"
 uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043"
-version = "1.2.0"
+version = "1.4.1"
 
-[[Plots]]
-deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "GeometryBasics", "JSON", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs", "UnicodeFun", "Unzip"]
-git-tree-sha1 = "d05baca9ec540de3d8b12ef660c7353aae9f9477"
+[[deps.Plots]]
+deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "JLFzf", "JSON", "LaTeXStrings", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "PrecompileTools", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "RelocatableFolders", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "TOML", "UUIDs", "UnicodeFun", "UnitfulLatexify", "Unzip"]
+git-tree-sha1 = "082f0c4b70c202c37784ce4bfbc33c9f437685bf"
 uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
-version = "1.28.1"
-
-[[PooledArrays]]
+version = "1.40.5"
+
+    [deps.Plots.extensions]
+    FileIOExt = "FileIO"
+    GeometryBasicsExt = "GeometryBasics"
+    IJuliaExt = "IJulia"
+    ImageInTerminalExt = "ImageInTerminal"
+    UnitfulExt = "Unitful"
+
+    [deps.Plots.weakdeps]
+    FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+    GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
+    IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a"
+    ImageInTerminal = "d8c32880-2388-543b-8c61-d9f865259254"
+    Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
+
+[[deps.PooledArrays]]
 deps = ["DataAPI", "Future"]
-git-tree-sha1 = "a6062fe4063cdafe78f4a0a81cfffb89721b30e7"
+git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3"
 uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
-version = "1.4.2"
+version = "1.4.3"
+
+[[deps.PrecompileTools]]
+deps = ["Preferences"]
+git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f"
+uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
+version = "1.2.1"
 
-[[Preferences]]
+[[deps.Preferences]]
 deps = ["TOML"]
-git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d"
+git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6"
 uuid = "21216c6a-2e73-6563-6e65-726566657250"
-version = "1.3.0"
+version = "1.4.3"
 
-[[PrettyTables]]
-deps = ["Crayons", "Formatting", "Markdown", "Reexport", "Tables"]
-git-tree-sha1 = "dfb54c4e414caa595a1f2ed759b160f5a3ddcba5"
+[[deps.PrettyTables]]
+deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"]
+git-tree-sha1 = "66b20dd35966a748321d3b2537c4584cf40387c7"
 uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
-version = "1.3.1"
+version = "2.3.2"
 
-[[Printf]]
+[[deps.Printf]]
 deps = ["Unicode"]
 uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 
-[[Profile]]
+[[deps.Profile]]
 deps = ["Printf"]
 uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
 
-[[ProgressLogging]]
+[[deps.ProgressBars]]
+deps = ["Printf"]
+git-tree-sha1 = "b437cdb0385ed38312d91d9c00c20f3798b30256"
+uuid = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
+version = "1.5.1"
+
+[[deps.ProgressLogging]]
 deps = ["Logging", "SHA", "UUIDs"]
 git-tree-sha1 = "80d919dee55b9c50e8d9e2da5eeafff3fe58b539"
 uuid = "33c8b6b6-d38a-422a-b730-caa89a2f386c"
 version = "0.1.4"
 
-[[PyCall]]
+[[deps.PyCall]]
 deps = ["Conda", "Dates", "Libdl", "LinearAlgebra", "MacroTools", "Serialization", "VersionParsing"]
-git-tree-sha1 = "1fc929f47d7c151c839c5fc1375929766fb8edcc"
+git-tree-sha1 = "9816a3826b0ebf49ab4926e2b18842ad8b5c8f04"
 uuid = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
-version = "1.93.1"
+version = "1.96.4"
 
-[[PyPlot]]
+[[deps.PyPlot]]
 deps = ["Colors", "LaTeXStrings", "PyCall", "Sockets", "Test", "VersionParsing"]
-git-tree-sha1 = "14c1b795b9d764e1784713941e787e1384268103"
+git-tree-sha1 = "0371ca706e3f295481cbf94c8c36692b072285c2"
 uuid = "d330b81b-6aea-500a-939a-2ce795aea3ee"
-version = "2.10.0"
-
-[[Qt5Base_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "Fontconfig_jll", "Glib_jll", "JLLWrappers", "Libdl", "Libglvnd_jll", "OpenSSL_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libxcb_jll", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_keysyms_jll", "Xorg_xcb_util_renderutil_jll", "Xorg_xcb_util_wm_jll", "Zlib_jll", "xkbcommon_jll"]
-git-tree-sha1 = "c6c0f690d0cc7caddb74cef7aa847b824a16b256"
-uuid = "ea2cea3b-5b76-57ae-a6ef-0a8af62496e1"
-version = "5.15.3+1"
-
-[[Quaternions]]
-deps = ["DualNumbers", "LinearAlgebra", "Random"]
-git-tree-sha1 = "b327e4db3f2202a4efafe7569fcbe409106a1f75"
+version = "2.11.5"
+
+[[deps.Qt6Base_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "Fontconfig_jll", "Glib_jll", "JLLWrappers", "Libdl", "Libglvnd_jll", "OpenSSL_jll", "Vulkan_Loader_jll", "Xorg_libSM_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Xorg_libxcb_jll", "Xorg_xcb_util_cursor_jll", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_keysyms_jll", "Xorg_xcb_util_renderutil_jll", "Xorg_xcb_util_wm_jll", "Zlib_jll", "libinput_jll", "xkbcommon_jll"]
+git-tree-sha1 = "492601870742dcd38f233b23c3ec629628c1d724"
+uuid = "c0090381-4147-56d7-9ebc-da0b1113ec56"
+version = "6.7.1+1"
+
+[[deps.Qt6Declarative_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Qt6Base_jll", "Qt6ShaderTools_jll"]
+git-tree-sha1 = "e5dd466bf2569fe08c91a2cc29c1003f4797ac3b"
+uuid = "629bc702-f1f5-5709-abd5-49b8460ea067"
+version = "6.7.1+2"
+
+[[deps.Qt6ShaderTools_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Qt6Base_jll"]
+git-tree-sha1 = "1a180aeced866700d4bebc3120ea1451201f16bc"
+uuid = "ce943373-25bb-56aa-8eca-768745ed7b5a"
+version = "6.7.1+1"
+
+[[deps.Qt6Wayland_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Qt6Base_jll", "Qt6Declarative_jll"]
+git-tree-sha1 = "729927532d48cf79f49070341e1d918a65aba6b0"
+uuid = "e99dba38-086e-5de3-a5b1-6e4c66e897c3"
+version = "6.7.1+1"
+
+[[deps.Quaternions]]
+deps = ["LinearAlgebra", "Random", "RealDot"]
+git-tree-sha1 = "994cc27cdacca10e68feb291673ec3a76aa2fae9"
 uuid = "94ee1d12-ae83-5a48-8b1c-48b8ff168ae0"
-version = "0.5.6"
+version = "0.7.6"
 
-[[REPL]]
+[[deps.REPL]]
 deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
 uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
 
-[[Random]]
-deps = ["SHA", "Serialization"]
+[[deps.Random]]
+deps = ["SHA"]
 uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
-[[Random123]]
+[[deps.Random123]]
 deps = ["Random", "RandomNumbers"]
-git-tree-sha1 = "afeacaecf4ed1649555a19cb2cad3c141bbc9474"
+git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7"
 uuid = "74087812-796a-5b5d-8853-05524746bad3"
-version = "1.5.0"
+version = "1.7.0"
 
-[[RandomNumbers]]
+[[deps.RandomNumbers]]
 deps = ["Random", "Requires"]
 git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
 uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
 version = "1.5.3"
 
-[[RecipesBase]]
-git-tree-sha1 = "6bf3f380ff52ce0832ddd3a2a7b9538ed1bcca7d"
+[[deps.RealDot]]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "9f0a1b71baaf7650f4fa8a1d168c7fb6ee41f0c9"
+uuid = "c1ae055f-0cd5-4b69-90a6-9a35b1a98df9"
+version = "0.1.0"
+
+[[deps.RecipesBase]]
+deps = ["PrecompileTools"]
+git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff"
 uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
-version = "1.2.1"
+version = "1.3.4"
 
-[[RecipesPipeline]]
-deps = ["Dates", "NaNMath", "PlotUtils", "RecipesBase"]
-git-tree-sha1 = "dc1e451e15d90347a7decc4221842a022b011714"
+[[deps.RecipesPipeline]]
+deps = ["Dates", "NaNMath", "PlotUtils", "PrecompileTools", "RecipesBase"]
+git-tree-sha1 = "45cf9fd0ca5839d06ef333c8201714e888486342"
 uuid = "01d81517-befc-4cb6-b9ec-a95719d0359c"
-version = "0.5.2"
+version = "0.6.12"
 
-[[Reexport]]
+[[deps.Reexport]]
 git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
 uuid = "189a3867-3050-52da-a836-e630ba90ab69"
 version = "1.2.2"
 
-[[RelocatableFolders]]
+[[deps.RelocatableFolders]]
 deps = ["SHA", "Scratch"]
-git-tree-sha1 = "cdbd3b1338c72ce29d9584fdbe9e9b70eeb5adca"
+git-tree-sha1 = "ffdaf70d81cf6ff22c2b6e733c900c3321cab864"
 uuid = "05181044-ff0b-4ac5-8273-598c1e38db00"
-version = "0.1.3"
+version = "1.0.1"
 
-[[Requires]]
+[[deps.Requires]]
 deps = ["UUIDs"]
 git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
 uuid = "ae029012-a4dd-5104-9daa-d747884805df"
 version = "1.3.0"
 
-[[Rotations]]
-deps = ["LinearAlgebra", "Quaternions", "Random", "StaticArrays", "Statistics"]
-git-tree-sha1 = "3177100077c68060d63dd71aec209373c3ec339b"
+[[deps.Rotations]]
+deps = ["LinearAlgebra", "Quaternions", "Random", "StaticArrays"]
+git-tree-sha1 = "5680a9276685d392c87407df00d57c9924d9f11e"
 uuid = "6038ab10-8711-5258-84ad-4b1120ba62dc"
-version = "1.3.1"
+version = "1.7.1"
+weakdeps = ["RecipesBase"]
 
-[[SHA]]
-uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+    [deps.Rotations.extensions]
+    RotationsRecipesBaseExt = "RecipesBase"
 
-[[SafeTestsets]]
-deps = ["Test"]
-git-tree-sha1 = "36ebc5622c82eb9324005cc75e7e2cc51181d181"
-uuid = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
-version = "0.0.1"
+[[deps.SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+version = "0.7.0"
 
-[[Scratch]]
+[[deps.Scratch]]
 deps = ["Dates"]
-git-tree-sha1 = "0b4b7f1393cff97c33891da2a0bf69c6ed241fda"
+git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386"
 uuid = "6c6a2e73-6563-6170-7368-637461726353"
-version = "1.1.0"
+version = "1.2.1"
 
-[[SeawaterPolynomials]]
-git-tree-sha1 = "e610123433b91a1a5fff1f7fefe77a5b4a9d260b"
+[[deps.SeawaterPolynomials]]
+git-tree-sha1 = "6d85acd6de472f8e6da81c61c7c5b6280a55e0bc"
 uuid = "d496a93d-167e-4197-9f49-d3af4ff8fe40"
-version = "0.2.2"
+version = "0.3.4"
 
-[[Serialization]]
-uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+[[deps.SentinelArrays]]
+deps = ["Dates", "Random"]
+git-tree-sha1 = "ff11acffdb082493657550959d4feb4b6149e73a"
+uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
+version = "1.4.5"
 
-[[SharedArrays]]
-deps = ["Distributed", "Mmap", "Random", "Serialization"]
-uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
+[[deps.Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 
-[[Showoff]]
+[[deps.Showoff]]
 deps = ["Dates", "Grisu"]
 git-tree-sha1 = "91eddf657aca81df9ae6ceb20b959ae5653ad1de"
 uuid = "992d4aef-0814-514b-bc4d-f2e9a6c4116f"
 version = "1.0.3"
 
-[[Sockets]]
+[[deps.SimpleBufferStream]]
+git-tree-sha1 = "874e8867b33a00e784c8a7e4b60afe9e037b74e1"
+uuid = "777ac1f9-54b0-4bf8-805c-2214025038e7"
+version = "1.1.0"
+
+[[deps.Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 
-[[SortingAlgorithms]]
+[[deps.SortingAlgorithms]]
 deps = ["DataStructures"]
-git-tree-sha1 = "b3363d7460f7d098ca0912c69b082f75625d7508"
+git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085"
 uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
-version = "1.0.1"
+version = "1.2.1"
 
-[[SparseArrays]]
-deps = ["LinearAlgebra", "Random"]
+[[deps.SparseArrays]]
+deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+version = "1.10.0"
 
-[[SpecialFunctions]]
-deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
-git-tree-sha1 = "5ba658aeecaaf96923dce0da9e703bd1fe7666f9"
+[[deps.SpecialFunctions]]
+deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
+git-tree-sha1 = "2f5d4697f21388cbe1ff299430dd169ef97d7e14"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "2.1.4"
+version = "2.4.0"
 
-[[Static]]
-deps = ["IfElse"]
-git-tree-sha1 = "91181e5820a400d1171db4382aa36e7fd19bee27"
-uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
-version = "0.6.3"
+    [deps.SpecialFunctions.extensions]
+    SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
 
-[[StaticArrays]]
-deps = ["LinearAlgebra", "Random", "Statistics"]
-git-tree-sha1 = "cd56bf18ed715e8b09f06ef8c6b781e6cdc49911"
+    [deps.SpecialFunctions.weakdeps]
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+
+[[deps.Static]]
+deps = ["CommonWorldInvalidations", "IfElse", "PrecompileTools"]
+git-tree-sha1 = "87d51a3ee9a4b0d2fe054bdd3fc2436258db2603"
+uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
+version = "1.1.1"
+
+[[deps.StaticArrayInterface]]
+deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "PrecompileTools", "Requires", "SparseArrays", "Static", "SuiteSparse"]
+git-tree-sha1 = "8963e5a083c837531298fc41599182a759a87a6d"
+uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718"
+version = "1.5.1"
+weakdeps = ["OffsetArrays", "StaticArrays"]
+
+    [deps.StaticArrayInterface.extensions]
+    StaticArrayInterfaceOffsetArraysExt = "OffsetArrays"
+    StaticArrayInterfaceStaticArraysExt = "StaticArrays"
+
+[[deps.StaticArrays]]
+deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"]
+git-tree-sha1 = "eeafab08ae20c62c44c8399ccb9354a04b80db50"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.4.4"
+version = "1.9.7"
+
+    [deps.StaticArrays.extensions]
+    StaticArraysChainRulesCoreExt = "ChainRulesCore"
+    StaticArraysStatisticsExt = "Statistics"
+
+    [deps.StaticArrays.weakdeps]
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+    Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
-[[StaticPermutations]]
+[[deps.StaticArraysCore]]
+git-tree-sha1 = "192954ef1208c7019899fbf8049e717f92959682"
+uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
+version = "1.4.3"
+
+[[deps.StaticPermutations]]
 git-tree-sha1 = "193c3daa18ff3e55c1dae66acb6a762c4a3bdb0b"
 uuid = "15972242-4b8f-49a0-b8a1-9ac0e7a1a45d"
 version = "0.3.0"
 
-[[Statistics]]
+[[deps.Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+version = "1.10.0"
 
-[[StatsAPI]]
+[[deps.StatsAPI]]
 deps = ["LinearAlgebra"]
-git-tree-sha1 = "c82aaa13b44ea00134f8c9c89819477bd3986ecd"
+git-tree-sha1 = "1ff449ad350c9c4cbc756624d6f8a8c3ef56d3ed"
 uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
-version = "1.3.0"
+version = "1.7.0"
 
-[[StatsBase]]
+[[deps.StatsBase]]
 deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"]
-git-tree-sha1 = "8977b17906b0a1cc74ab2e3a05faa16cf08a8291"
+git-tree-sha1 = "5cf7606d6cef84b543b483848d4ae08ad9832b21"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-version = "0.33.16"
+version = "0.34.3"
 
-[[Strided]]
-deps = ["LinearAlgebra", "TupleTools"]
-git-tree-sha1 = "632001471b256ceac6912c3e64d4b5c65154b216"
+[[deps.Strided]]
+deps = ["LinearAlgebra", "StridedViews", "TupleTools"]
+git-tree-sha1 = "bd9bd1c70cfc115cc3a30213fc725125a6b43652"
 uuid = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
-version = "1.2.2"
+version = "2.1.0"
+
+[[deps.StridedViews]]
+deps = ["LinearAlgebra", "PackageExtensionCompat"]
+git-tree-sha1 = "2917996ce0fa6b8a3a85240a5e9ff930e2aeaa43"
+uuid = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
+version = "0.3.1"
+weakdeps = ["CUDA"]
 
-[[StructArrays]]
-deps = ["Adapt", "DataAPI", "StaticArrays", "Tables"]
-git-tree-sha1 = "8f705dd141733d79aa2932143af6c6e0b6cea8df"
+    [deps.StridedViews.extensions]
+    StridedViewsCUDAExt = "CUDA"
+
+[[deps.StringManipulation]]
+deps = ["PrecompileTools"]
+git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5"
+uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e"
+version = "0.3.4"
+
+[[deps.StructArrays]]
+deps = ["ConstructionBase", "DataAPI", "Tables"]
+git-tree-sha1 = "f4dc295e983502292c4c3f951dbb4e985e35b3be"
 uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
-version = "0.6.6"
+version = "0.6.18"
+weakdeps = ["Adapt", "GPUArraysCore", "SparseArrays", "StaticArrays"]
+
+    [deps.StructArrays.extensions]
+    StructArraysAdaptExt = "Adapt"
+    StructArraysGPUArraysCoreExt = "GPUArraysCore"
+    StructArraysSparseArraysExt = "SparseArrays"
+    StructArraysStaticArraysExt = "StaticArrays"
 
-[[StructTypes]]
+[[deps.StructTypes]]
 deps = ["Dates", "UUIDs"]
-git-tree-sha1 = "d24a825a95a6d98c385001212dc9020d609f2d4f"
+git-tree-sha1 = "ca4bccb03acf9faaf4137a9abc1881ed1841aa70"
 uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
-version = "1.8.1"
+version = "1.10.0"
+
+[[deps.SuiteSparse]]
+deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"]
+uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"
 
-[[TOML]]
+[[deps.SuiteSparse_jll]]
+deps = ["Artifacts", "Libdl", "libblastrampoline_jll"]
+uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
+version = "7.2.1+1"
+
+[[deps.TOML]]
 deps = ["Dates"]
 uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
+version = "1.0.3"
 
-[[TableTraits]]
+[[deps.TableTraits]]
 deps = ["IteratorInterfaceExtensions"]
 git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
 uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
 version = "1.0.1"
 
-[[Tables]]
-deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"]
-git-tree-sha1 = "5ce79ce186cc678bbb5c5681ca3379d1ddae11a1"
+[[deps.Tables]]
+deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "OrderedCollections", "TableTraits"]
+git-tree-sha1 = "598cd7c1f68d1e205689b1c2fe65a9f85846f297"
 uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
-version = "1.7.0"
+version = "1.12.0"
 
-[[Tar]]
+[[deps.Tar]]
 deps = ["ArgTools", "SHA"]
 uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
+version = "1.10.0"
 
-[[TaylorSeries]]
-deps = ["InteractiveUtils", "LinearAlgebra", "Markdown", "Requires", "SparseArrays"]
-git-tree-sha1 = "66f4d1993bae49eeba21a1634b5f65782585a42c"
+[[deps.TaylorSeries]]
+deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"]
+git-tree-sha1 = "1c7170668366821b0c4c4fe03ee78f8d6cf36e2c"
 uuid = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea"
-version = "0.10.13"
+version = "0.16.0"
+
+    [deps.TaylorSeries.extensions]
+    TaylorSeriesIAExt = "IntervalArithmetic"
+
+    [deps.TaylorSeries.weakdeps]
+    IntervalArithmetic = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253"
+
+[[deps.TensorCore]]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "1feb45f88d133a655e001435632f019a9a1bcdb6"
+uuid = "62fd8b95-f654-4bbd-a8a5-9c27f68ccd50"
+version = "0.1.1"
 
-[[TerminalLoggers]]
+[[deps.TerminalLoggers]]
 deps = ["LeftChildRightSiblingTrees", "Logging", "Markdown", "Printf", "ProgressLogging", "UUIDs"]
-git-tree-sha1 = "62846a48a6cd70e63aa29944b8c4ef704360d72f"
+git-tree-sha1 = "f133fab380933d042f6796eda4e130272ba520ca"
 uuid = "5d786b92-1e48-4d6f-9151-6b4477ca9bed"
-version = "0.1.5"
+version = "0.1.7"
 
-[[Test]]
+[[deps.Test]]
 deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
 uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
-[[TimerOutputs]]
+[[deps.TimerOutputs]]
 deps = ["ExprTools", "Printf"]
-git-tree-sha1 = "11db03dd5bbc0d2b57a570d228a0f34538c586b1"
+git-tree-sha1 = "5a13ae8a41237cff5ecf34f73eb1b8f42fff6531"
 uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
-version = "0.5.17"
+version = "0.5.24"
 
-[[TranscodingStreams]]
-deps = ["Random", "Test"]
-git-tree-sha1 = "216b95ea110b5972db65aa90f88d8d89dcb8851c"
+[[deps.TranscodingStreams]]
+git-tree-sha1 = "96612ac5365777520c3c5396314c8cf7408f436a"
 uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
-version = "0.9.6"
+version = "0.11.1"
+weakdeps = ["Random", "Test"]
 
-[[Tullio]]
-deps = ["ChainRulesCore", "DiffRules", "LinearAlgebra", "Requires"]
-git-tree-sha1 = "859e2e9a7222553a0c052e423557cedb49376da9"
-uuid = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
-version = "0.3.4"
+    [deps.TranscodingStreams.extensions]
+    TestExt = ["Test", "Random"]
 
-[[TupleTools]]
-git-tree-sha1 = "3c712976c47707ff893cf6ba4354aa14db1d8938"
+[[deps.TupleTools]]
+git-tree-sha1 = "41d61b1c545b06279871ef1a4b5fcb2cac2191cd"
 uuid = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
-version = "1.3.0"
+version = "1.5.0"
 
-[[URIs]]
-git-tree-sha1 = "97bbe755a53fe859669cd907f2d96aee8d2c1355"
+[[deps.URIs]]
+git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b"
 uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
-version = "1.3.0"
+version = "1.5.1"
 
-[[UUIDs]]
+[[deps.UUIDs]]
 deps = ["Random", "SHA"]
 uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
-[[Unicode]]
+[[deps.Unicode]]
 uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
-[[UnicodeFun]]
+[[deps.UnicodeFun]]
 deps = ["REPL"]
 git-tree-sha1 = "53915e50200959667e78a92a418594b428dffddf"
 uuid = "1cfade01-22cf-5700-b092-accc4b62d6e1"
 version = "0.4.1"
 
-[[Unzip]]
-git-tree-sha1 = "34db80951901073501137bdbc3d5a8e7bbd06670"
+[[deps.Unitful]]
+deps = ["Dates", "LinearAlgebra", "Random"]
+git-tree-sha1 = "d95fe458f26209c66a187b1114df96fd70839efd"
+uuid = "1986cc42-f94f-5a68-af5c-568840ba703d"
+version = "1.21.0"
+
+    [deps.Unitful.extensions]
+    ConstructionBaseUnitfulExt = "ConstructionBase"
+    InverseFunctionsUnitfulExt = "InverseFunctions"
+
+    [deps.Unitful.weakdeps]
+    ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
+    InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
+
+[[deps.UnitfulLatexify]]
+deps = ["LaTeXStrings", "Latexify", "Unitful"]
+git-tree-sha1 = "975c354fcd5f7e1ddcc1f1a23e6e091d99e99bc8"
+uuid = "45397f5d-5981-4c77-b2b3-fc36d6e9b728"
+version = "1.6.4"
+
+[[deps.UnsafeAtomics]]
+git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
+uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
+version = "0.2.1"
+
+[[deps.UnsafeAtomicsLLVM]]
+deps = ["LLVM", "UnsafeAtomics"]
+git-tree-sha1 = "bf2c553f25e954a9b38c9c0593a59bb13113f9e5"
+uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
+version = "0.1.5"
+
+[[deps.Unzip]]
+git-tree-sha1 = "ca0969166a028236229f63514992fc073799bb78"
 uuid = "41fe7b60-77ed-43a1-b4f0-825fd5a5650d"
-version = "0.1.2"
+version = "0.2.0"
 
-[[VersionParsing]]
+[[deps.VersionParsing]]
 git-tree-sha1 = "58d6e80b4ee071f5efd07fda82cb9fbe17200868"
 uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
 version = "1.3.0"
 
-[[Wayland_jll]]
-deps = ["Artifacts", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg", "XML2_jll"]
-git-tree-sha1 = "3e61f0b86f90dacb0bc0e73a0c5a83f6a8636e23"
+[[deps.Vulkan_Loader_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Wayland_jll", "Xorg_libX11_jll", "Xorg_libXrandr_jll", "xkbcommon_jll"]
+git-tree-sha1 = "2f0486047a07670caad3a81a075d2e518acc5c59"
+uuid = "a44049a8-05dd-5a78-86c9-5fde0876e88c"
+version = "1.3.243+0"
+
+[[deps.Wayland_jll]]
+deps = ["Artifacts", "EpollShim_jll", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg", "XML2_jll"]
+git-tree-sha1 = "7558e29847e99bc3f04d6569e82d0f5c54460703"
 uuid = "a2964d1f-97da-50d4-b82a-358c7fce9d89"
-version = "1.19.0+0"
+version = "1.21.0+1"
 
-[[Wayland_protocols_jll]]
+[[deps.Wayland_protocols_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "4528479aa01ee1b3b4cd0e6faef0e04cf16466da"
+git-tree-sha1 = "93f43ab61b16ddfb2fd3bb13b3ce241cafb0e6c9"
 uuid = "2381bf8a-dfd0-557d-9999-79630e7b1b91"
-version = "1.25.0+0"
+version = "1.31.0+0"
 
-[[XML2_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
-git-tree-sha1 = "1acf5bdf07aa0907e0a37d3718bb88d4b687b74a"
+[[deps.XML2_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"]
+git-tree-sha1 = "d9717ce3518dc68a99e6b96300813760d887a01d"
 uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
-version = "2.9.12+0"
+version = "2.13.1+0"
 
-[[XSLT_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"]
-git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a"
+[[deps.XSLT_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "XML2_jll", "Zlib_jll"]
+git-tree-sha1 = "a54ee957f4c86b526460a720dbc882fa5edcbefc"
 uuid = "aed1982a-8fda-507f-9586-7b0439959a61"
-version = "1.1.34+0"
-
-[[Xorg_libX11_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"]
-git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527"
+version = "1.1.41+0"
+
+[[deps.XZ_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "ac88fb95ae6447c8dda6a5503f3bafd496ae8632"
+uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
+version = "5.4.6+0"
+
+[[deps.Xorg_libICE_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "326b4fea307b0b39892b3e85fa451692eda8d46c"
+uuid = "f67eecfb-183a-506d-b269-f58e52b52d7c"
+version = "1.1.1+0"
+
+[[deps.Xorg_libSM_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libICE_jll"]
+git-tree-sha1 = "3796722887072218eabafb494a13c963209754ce"
+uuid = "c834827a-8449-5923-a945-d239c165b7dd"
+version = "1.2.4+0"
+
+[[deps.Xorg_libX11_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxcb_jll", "Xorg_xtrans_jll"]
+git-tree-sha1 = "afead5aba5aa507ad5a3bf01f58f82c8d1403495"
 uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc"
-version = "1.6.9+4"
+version = "1.8.6+0"
 
-[[Xorg_libXau_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e"
+[[deps.Xorg_libXau_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "6035850dcc70518ca32f012e46015b9beeda49d8"
 uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec"
-version = "1.0.9+4"
+version = "1.0.11+0"
 
-[[Xorg_libXcursor_jll]]
+[[deps.Xorg_libXcursor_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXfixes_jll", "Xorg_libXrender_jll"]
 git-tree-sha1 = "12e0eb3bc634fa2080c1c37fccf56f7c22989afd"
 uuid = "935fb764-8cf2-53bf-bb30-45bb1f8bf724"
 version = "1.2.0+4"
 
-[[Xorg_libXdmcp_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4"
+[[deps.Xorg_libXdmcp_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "34d526d318358a859d7de23da945578e8e8727b7"
 uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05"
-version = "1.1.3+4"
+version = "1.1.4+0"
 
-[[Xorg_libXext_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
-git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3"
+[[deps.Xorg_libXext_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libX11_jll"]
+git-tree-sha1 = "d2d1a5c49fae4ba39983f63de6afcbea47194e85"
 uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3"
-version = "1.3.4+4"
+version = "1.3.6+0"
 
-[[Xorg_libXfixes_jll]]
+[[deps.Xorg_libXfixes_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
 git-tree-sha1 = "0e0dc7431e7a0587559f9294aeec269471c991a4"
 uuid = "d091e8ba-531a-589c-9de9-94069b037ed8"
 version = "5.0.3+4"
 
-[[Xorg_libXi_jll]]
+[[deps.Xorg_libXi_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll", "Xorg_libXfixes_jll"]
 git-tree-sha1 = "89b52bc2160aadc84d707093930ef0bffa641246"
 uuid = "a51aa0fd-4e3c-5386-b890-e753decda492"
 version = "1.7.10+4"
 
-[[Xorg_libXinerama_jll]]
+[[deps.Xorg_libXinerama_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll"]
 git-tree-sha1 = "26be8b1c342929259317d8b9f7b53bf2bb73b123"
 uuid = "d1454406-59df-5ea1-beac-c340f2130bc3"
 version = "1.1.4+4"
 
-[[Xorg_libXrandr_jll]]
+[[deps.Xorg_libXrandr_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll"]
 git-tree-sha1 = "34cea83cb726fb58f325887bf0612c6b3fb17631"
 uuid = "ec84b674-ba8e-5d96-8ba1-2a689ba10484"
 version = "1.5.2+4"
 
-[[Xorg_libXrender_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
-git-tree-sha1 = "19560f30fd49f4d4efbe7002a1037f8c43d43b96"
+[[deps.Xorg_libXrender_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libX11_jll"]
+git-tree-sha1 = "47e45cd78224c53109495b3e324df0c37bb61fbe"
 uuid = "ea2f1a96-1ddc-540d-b46f-429655e07cfa"
-version = "0.9.10+4"
+version = "0.9.11+0"
 
-[[Xorg_libpthread_stubs_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb"
+[[deps.Xorg_libpthread_stubs_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "8fdda4c692503d44d04a0603d9ac0982054635f9"
 uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74"
-version = "0.1.0+3"
+version = "0.1.1+0"
 
-[[Xorg_libxcb_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"]
-git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6"
+[[deps.Xorg_libxcb_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"]
+git-tree-sha1 = "bcd466676fef0878338c61e655629fa7bbc69d8e"
 uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b"
-version = "1.13.0+3"
+version = "1.17.0+0"
 
-[[Xorg_libxkbfile_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"]
-git-tree-sha1 = "926af861744212db0eb001d9e40b5d16292080b2"
+[[deps.Xorg_libxkbfile_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libX11_jll"]
+git-tree-sha1 = "730eeca102434283c50ccf7d1ecdadf521a765a4"
 uuid = "cc61e674-0454-545c-8b26-ed2c68acab7a"
-version = "1.1.0+4"
+version = "1.1.2+0"
+
+[[deps.Xorg_xcb_util_cursor_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_jll", "Xorg_xcb_util_renderutil_jll"]
+git-tree-sha1 = "04341cb870f29dcd5e39055f895c39d016e18ccd"
+uuid = "e920d4aa-a673-5f3a-b3d7-f755a4d47c43"
+version = "0.1.4+0"
 
-[[Xorg_xcb_util_image_jll]]
+[[deps.Xorg_xcb_util_image_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"]
 git-tree-sha1 = "0fab0a40349ba1cba2c1da699243396ff8e94b97"
 uuid = "12413925-8142-5f55-bb0e-6d7ca50bb09b"
 version = "0.4.0+1"
 
-[[Xorg_xcb_util_jll]]
+[[deps.Xorg_xcb_util_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll"]
 git-tree-sha1 = "e7fd7b2881fa2eaa72717420894d3938177862d1"
 uuid = "2def613f-5ad1-5310-b15b-b15d46f528f5"
 version = "0.4.0+1"
 
-[[Xorg_xcb_util_keysyms_jll]]
+[[deps.Xorg_xcb_util_keysyms_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"]
 git-tree-sha1 = "d1151e2c45a544f32441a567d1690e701ec89b00"
 uuid = "975044d2-76e6-5fbe-bf08-97ce7c6574c7"
 version = "0.4.0+1"
 
-[[Xorg_xcb_util_renderutil_jll]]
+[[deps.Xorg_xcb_util_renderutil_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"]
 git-tree-sha1 = "dfd7a8f38d4613b6a575253b3174dd991ca6183e"
 uuid = "0d47668e-0667-5a69-a72c-f761630bfb7e"
 version = "0.3.9+1"
 
-[[Xorg_xcb_util_wm_jll]]
+[[deps.Xorg_xcb_util_wm_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"]
 git-tree-sha1 = "e78d10aab01a4a154142c5006ed44fd9e8e31b67"
 uuid = "c22f9ab0-d5fe-5066-847c-f4bb1cd4e361"
 version = "0.4.1+1"
 
-[[Xorg_xkbcomp_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxkbfile_jll"]
-git-tree-sha1 = "4bcbf660f6c2e714f87e960a171b119d06ee163b"
+[[deps.Xorg_xkbcomp_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxkbfile_jll"]
+git-tree-sha1 = "330f955bc41bb8f5270a369c473fc4a5a4e4d3cb"
 uuid = "35661453-b289-5fab-8a00-3d9160c6a3a4"
-version = "1.4.2+4"
+version = "1.4.6+0"
 
-[[Xorg_xkeyboard_config_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xkbcomp_jll"]
-git-tree-sha1 = "5c8424f8a67c3f2209646d4425f3d415fee5931d"
+[[deps.Xorg_xkeyboard_config_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_xkbcomp_jll"]
+git-tree-sha1 = "691634e5453ad362044e2ad653e79f3ee3bb98c3"
 uuid = "33bec58e-1273-512f-9401-5d533626f822"
-version = "2.27.0+4"
+version = "2.39.0+0"
 
-[[Xorg_xtrans_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845"
+[[deps.Xorg_xtrans_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "e92a1a012a10506618f10b7047e478403a046c77"
 uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10"
-version = "1.4.0+3"
+version = "1.5.0+0"
 
-[[Zlib_jll]]
+[[deps.Zlib_jll]]
 deps = ["Libdl"]
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
+version = "1.2.13+1"
 
-[[Zstd_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "e45044cd873ded54b6a5bac0eb5c971392cf1927"
+[[deps.Zstd_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "e678132f07ddb5bfa46857f0d7620fb9be675d3b"
 uuid = "3161d3a3-bdf6-5164-811a-617609db77b4"
-version = "1.5.2+0"
+version = "1.5.6+0"
 
-[[libass_jll]]
+[[deps.eudev_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gperf_jll"]
+git-tree-sha1 = "431b678a28ebb559d224c0b6b6d01afce87c51ba"
+uuid = "35ca27e7-8b34-5b7f-bca9-bdc33f59eb06"
+version = "3.2.9+0"
+
+[[deps.fzf_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "a68c9655fbe6dfcab3d972808f1aafec151ce3f8"
+uuid = "214eeab7-80f7-51ab-84ad-2988db7cef09"
+version = "0.43.0+0"
+
+[[deps.gperf_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "3516a5630f741c9eecb3720b1ec9d8edc3ecc033"
+uuid = "1a1c6b14-54f6-533d-8383-74cd7377aa70"
+version = "3.1.1+0"
+
+[[deps.libaec_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "46bf7be2917b59b761247be3f317ddf75e50e997"
+uuid = "477f73a3-ac25-53e9-8cc3-50b2fa2566f0"
+version = "1.1.2+0"
+
+[[deps.libaom_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "1827acba325fdcdf1d2647fc8d5301dd9ba43a9d"
+uuid = "a4ae2306-e953-59d6-aa16-d00cac43593b"
+version = "3.9.0+0"
+
+[[deps.libass_jll]]
 deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "HarfBuzz_jll", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
 git-tree-sha1 = "5982a94fcba20f02f42ace44b9894ee2b140fe47"
 uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0"
 version = "0.15.1+0"
 
-[[libblastrampoline_jll]]
-deps = ["Artifacts", "Libdl", "OpenBLAS_jll"]
+[[deps.libblastrampoline_jll]]
+deps = ["Artifacts", "Libdl"]
 uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
+version = "5.8.0+1"
 
-[[libfdk_aac_jll]]
+[[deps.libevdev_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "141fe65dc3efabb0b1d5ba74e91f6ad26f84cc22"
+uuid = "2db6ffa8-e38f-5e21-84af-90c45d0032cc"
+version = "1.11.0+0"
+
+[[deps.libfdk_aac_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "daacc84a041563f965be61859a36e17c4e4fcd55"
 uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280"
 version = "2.0.2+0"
 
-[[libpng_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"]
-git-tree-sha1 = "94d180a6d2b5e55e447e2d27a29ed04fe79eb30c"
+[[deps.libinput_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "eudev_jll", "libevdev_jll", "mtdev_jll"]
+git-tree-sha1 = "ad50e5b90f222cfe78aa3d5183a20a12de1322ce"
+uuid = "36db933b-70db-51c0-b978-0f229ee0e533"
+version = "1.18.0+0"
+
+[[deps.libpng_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"]
+git-tree-sha1 = "d7015d2e18a5fd9a4f47de711837e980519781a4"
 uuid = "b53b4c65-9356-5827-b1ea-8c7a1a84506f"
-version = "1.6.38+0"
+version = "1.6.43+1"
 
-[[libvorbis_jll]]
+[[deps.libvorbis_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Ogg_jll", "Pkg"]
-git-tree-sha1 = "b910cb81ef3fe6e78bf6acee440bda86fd6ae00c"
+git-tree-sha1 = "490376214c4721cdaca654041f635213c6165cb3"
 uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a"
-version = "1.3.7+1"
+version = "1.3.7+2"
+
+[[deps.libzip_jll]]
+deps = ["Artifacts", "Bzip2_jll", "GnuTLS_jll", "JLLWrappers", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"]
+git-tree-sha1 = "3282b7d16ae7ac3e57ec2f3fa8fafb564d8f9f7f"
+uuid = "337d8026-41b4-5cde-a456-74a10e5b31d1"
+version = "1.10.1+0"
 
-[[nghttp2_jll]]
+[[deps.mtdev_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "814e154bdb7be91d78b6802843f76b6ece642f11"
+uuid = "009596ad-96f7-51b1-9f1b-5ce2d5e8a71e"
+version = "1.1.6+0"
+
+[[deps.nghttp2_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
+version = "1.52.0+1"
+
+[[deps.oneTBB_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "7d0ea0f4895ef2f5cb83645fa689e52cb55cf493"
+uuid = "1317d2d5-d96f-522e-a858-c73665f53c3e"
+version = "2021.12.0+0"
 
-[[p7zip_jll]]
+[[deps.p7zip_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
+version = "17.4.0+2"
 
-[[x264_jll]]
+[[deps.x264_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "4fea590b89e6ec504593146bf8b988b2c00922b2"
 uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a"
 version = "2021.5.5+0"
 
-[[x265_jll]]
+[[deps.x265_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
 git-tree-sha1 = "ee567a171cce03570d77ad3a43e90218e38937a9"
 uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76"
 version = "3.5.0+0"
 
-[[xkbcommon_jll]]
+[[deps.xkbcommon_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Wayland_jll", "Wayland_protocols_jll", "Xorg_libxcb_jll", "Xorg_xkeyboard_config_jll"]
-git-tree-sha1 = "ece2350174195bb31de1a63bea3a41ae1aa593b6"
+git-tree-sha1 = "9c304562909ab2bab0262639bd4f444d7bc2be37"
 uuid = "d8fb68d0-12a3-5cfd-a85a-d49703b185fd"
-version = "0.9.1+5"
+version = "1.4.1+1"
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index 4a0422913b..02d8d9445f 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -11,6 +11,7 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 IncompleteLU = "40713840-3770-5561-ab4c-a76e7d0d7895"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
+JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Oceananigans = "9e8cae18-63c1-5223-a75c-80ca9d6e9a09"
@@ -22,4 +23,4 @@ PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee"
 SeawaterPolynomials = "d496a93d-167e-4197-9f49-d3af4ff8fe40"
 
 [compat]
-CUDA = "3.8"
+CUDA = "^5"
diff --git a/benchmark/benchmark_advection_schemes.jl b/benchmark/benchmark_advection_schemes.jl
index a0f7e1be89..26232c17a7 100644
--- a/benchmark/benchmark_advection_schemes.jl
+++ b/benchmark/benchmark_advection_schemes.jl
@@ -8,9 +8,10 @@ using Benchmarks
 
 # Benchmark function
 
-function benchmark_advection_scheme(Arch, Scheme)
+function benchmark_advection_scheme(Arch, Scheme, order)
     grid = RectilinearGrid(Arch(); size=(192, 192, 192), extent=(1, 1, 1))
-    model = NonhydrostaticModel(grid=grid, advection=Scheme())
+    order = Scheme == Centered ? order + 1 : order
+    model = NonhydrostaticModel(grid=grid, advection=Scheme(; order))
 
     time_step!(model, 1) # warmup
 
@@ -24,12 +25,13 @@ end
 # Benchmark parameters
 
 Architectures = has_cuda() ? [CPU, GPU] : [CPU]
-Schemes = (CenteredSecondOrder, CenteredFourthOrder, UpwindBiasedThirdOrder, UpwindBiasedFifthOrder, WENO)
+Schemes = (Centered, UpwindBiased, WENO)
+orders  = (1, 3, 5, 7, 9)
 
 # Run and summarize benchmarks
 
 print_system_info()
-suite = run_benchmarks(benchmark_advection_scheme; Architectures, Schemes)
+suite = run_benchmarks(benchmark_advection_scheme; Architectures, Schemes, orders)
 
 df = benchmarks_dataframe(suite)
 sort!(df, [:Architectures, :Schemes], by=string)
diff --git a/benchmark/src/Benchmarks.jl b/benchmark/src/Benchmarks.jl
index 03a3f62527..aea41a8931 100644
--- a/benchmark/src/Benchmarks.jl
+++ b/benchmark/src/Benchmarks.jl
@@ -88,15 +88,15 @@ end
 
 function benchmarks_pretty_table(df; title="")
     header = propertynames(df) .|> String
-    pretty_table(df, header, nosubheader=true, title=title, title_alignment=:c,
+    pretty_table(String, df; header, title=title, title_alignment=:c,
                  title_autowrap = true, title_same_width_as_table = true)
 
     html_filename = replace(title, ' ' => '_') * ".html"
     @info "Writing $html_filename..."
     open(html_filename, "w") do io
-        html_table = pretty_table(String, df, header, nosubheader=true,
+        html_table = pretty_table(String, df; header,
                                   title=title, title_alignment=:c,
-                                  backend=:html, tf=tf_html_simple)
+                                  backend=Val(:html), tf=tf_html_simple)
         write(io, html_table)
     end
 
diff --git a/docs/Project.toml b/docs/Project.toml
index b64e79f872..a7e0daed23 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -5,6 +5,7 @@ DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Measures = "442fdcdd-2543-5da2-b0f3-8c86c306513e"
 NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
 Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
@@ -14,7 +15,7 @@ StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 TimesDates = "bdfc003b-8df8-5c39-adcd-3a9087f5df4a"
 
 [compat]
-CairoMakie = "0.11"
+CairoMakie = "0.11, 0.12"
 Documenter = "1"
 DocumenterCitations = "1"
 JLD2 = "0.4"
diff --git a/docs/make.jl b/docs/make.jl
index 83bb574555..101fee7894 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -10,6 +10,8 @@ Distributed.addprocs(2)
     using CairoMakie # to avoid capturing precompilation output by Literate
     CairoMakie.activate!(type = "svg")
 
+    using MPI # for distributed doctests
+
     using Oceananigans
     using Oceananigans.Operators
     using Oceananigans.Diagnostics
@@ -30,8 +32,8 @@ Distributed.addprocs(2)
     const EXAMPLES_DIR = joinpath(@__DIR__, "..", "examples")
     const OUTPUT_DIR   = joinpath(@__DIR__, "src/literated")
 
-    # The examples that take longer to run should be first. This ensures thats
-    # docs built using extra workers is as efficient as possible.
+    # The examples that take longer to run should be first. This ensures that
+    # building the docs with extra workers is as efficient as possible.
     example_scripts = [
         "internal_tide.jl",
         "shallow_water_Bickley_jet.jl",
@@ -46,10 +48,10 @@ Distributed.addprocs(2)
         "one_dimensional_diffusion.jl",
         "internal_wave.jl",
     ]
-
-    @info string("Executing the examples using ", Distributed.nprocs(), " processes")
 end
 
+@info string("Executing the examples using ", Distributed.nprocs(), " processes")
+
 Distributed.pmap(1:length(example_scripts)) do n
     example = example_scripts[n]
     example_filepath = joinpath(EXAMPLES_DIR, example)
@@ -85,9 +87,10 @@ example_pages = [
 
 model_setup_pages = [
     "Overview" => "model_setup/overview.md",
+    "Setting initial conditions" => "model_setup/setting_initial_conditions.md",
     "Architecture" => "model_setup/architecture.md",
     "Number type" => "model_setup/number_type.md",
-    "Grid" => "model_setup/grids.md",
+    "Grid" => "model_setup/legacy_grids.md",
     "Clock" => "model_setup/clock.md",
     "Coriolis (rotation)" => "model_setup/coriolis.md",
     "Tracers" => "model_setup/tracers.md",
@@ -101,7 +104,6 @@ model_setup_pages = [
     "Callbacks" => "model_setup/callbacks.md",
     "Output writers" => "model_setup/output_writers.md",
     "Checkpointing" => "model_setup/checkpointing.md",
-    "Setting initial conditions" => "model_setup/setting_initial_conditions.md"
 ]
 
 physics_pages = [
@@ -146,9 +148,18 @@ pages = [
     "Home" => "index.md",
     "Quick start" => "quick_start.md",
     "Examples" => example_pages,
+    "Grids" => "grids.md",
+    "Fields" => "fields.md",
+    "Operations" => "operations.md",
+    # TODO:
+    #   - Develop the following three tutorials on reductions, simulations, and post-processing
+    #   - Refactor the model setup pages and make them more tutorial-like.
+    # "Averages, integrals, and cumulative integrals" => "reductions_and_accumulations.md",
+    # "Simulations" => "simulations.md",
+    # "FieldTimeSeries and post-processing" => "field_time_series.md",
+    "Model setup (legacy)" => model_setup_pages,
     "Physics" => physics_pages,
     "Numerical implementation" => numerical_pages,
-    "Model setup" => model_setup_pages,
     "Simulation tips" => "simulation_tips.md",
     "Contributor's guide" => "contributing.md",
     "Gallery" => "gallery.md",
@@ -159,9 +170,10 @@ pages = [
 #####
 ##### Build and deploy docs
 #####
+ci_build = get(ENV, "CI", nothing) == "true"
 
 format = Documenter.HTML(collapselevel = 1,
-                         prettyurls = get(ENV, "CI", nothing) == "true",
+                         prettyurls = ci_build,
                          canonical = "https://clima.github.io/OceananigansDocumentation/stable/",
                          mathengine = MathJax3(),
                          size_threshold = 2^20,
@@ -177,32 +189,36 @@ makedocs(sitename = "Oceananigans.jl",
          modules = [Oceananigans],
          warnonly = [:cross_references],
          doctest = true, # set to false to speed things up
+         draft = false,  # set to true to speed things up
          clean = true,
          checkdocs = :exports) # set to :none to speed things up
 
-@info "Clean up temporary .jld2 and .nc output created by doctests or literated examples..."
-
 """
     recursive_find(directory, pattern)
 
 Return list of filepaths within `directory` that contains the `pattern::Regex`.
 """
-recursive_find(directory, pattern) =
-    mapreduce(vcat, walkdir(directory)) do (root, dirs, files)
-        joinpath.(root, filter(contains(pattern), files))
+function recursive_find(directory, pattern)
+    mapreduce(vcat, walkdir(directory)) do (root, dirs, filenames)
+        matched_filenames = filter(contains(pattern), filenames)
+        map(filename -> joinpath(root, filename), matched_filenames)
     end
+end
+
+@info "Cleaning up temporary .jld2 and .nc output created by doctests or literated examples..."
 
-files = []
 for pattern in [r"\.jld2", r"\.nc"]
-    global files = vcat(files, recursive_find(@__DIR__, pattern))
-end
+    filenames = recursive_find(@__DIR__, pattern)
 
-for file in files
-    rm(file)
+    for filename in filenames
+        rm(filename)
+    end
 end
 
-deploydocs(repo = "github.com/CliMA/OceananigansDocumentation.git",
-           versions = ["stable" => "v^", "dev" => "dev", "v#.#.#"],
-           forcepush = true,
-           push_preview = false,
-           devbranch = "main")
+if ci_build
+    deploydocs(repo = "github.com/CliMA/OceananigansDocumentation.git",
+               versions = ["stable" => "v^", "dev" => "dev", "v#.#.#"],
+               forcepush = true,
+               push_preview = true,
+               devbranch = "main")
+end
diff --git a/docs/src/contributing.md b/docs/src/contributing.md
index d600d92f0f..ba69f72b33 100644
--- a/docs/src/contributing.md
+++ b/docs/src/contributing.md
@@ -18,7 +18,7 @@ The simplest way to contribute to Oceananigans is to create or comment on issues
 
 The most useful bug reports:
 
-* Provide an explicit code snippet --- not just a link --- that reproduces the bug in the latest tagged version of Oceananigans. This is sometimes called the ["minimal working example"](https://en.wikipedia.org/wiki/Minimal_working_example). Reducing bug-producing code to a minimal example can dramatically decrease the time it takes to resolve an issue.
+* Provide an explicit code snippet -- not just a link -- that reproduces the bug in the latest tagged version of Oceananigans. This is sometimes called the ["minimal working example"](https://en.wikipedia.org/wiki/Minimal_working_example). Reducing bug-producing code to a minimal example can dramatically decrease the time it takes to resolve an issue.
 
 * Paste the _entire_ error received when running the code snippet, even if it's unbelievably long.
 
diff --git a/docs/src/field_time_series.md b/docs/src/field_time_series.md
new file mode 100644
index 0000000000..a9dfb0cf8a
--- /dev/null
+++ b/docs/src/field_time_series.md
@@ -0,0 +1,2 @@
+# FieldTimeSeries
+
diff --git a/docs/src/fields.md b/docs/src/fields.md
new file mode 100644
index 0000000000..6d39014d78
--- /dev/null
+++ b/docs/src/fields.md
@@ -0,0 +1,515 @@
+# Fields basics
+
+`Field` and its relatives are core Oceananigans data structures.
+`Field`s are more or less arrays of `data` located on a `grid`, whose entries
+correspond to the average value of some quantity over some finite-sized volume.
+`Field`s also may contain `boundary_conditions`, may be computed from an `operand` 
+or expression involving other fields, and may cover only a portion of the total
+`indices` spanned by the grid.
+
+## Staggered grids and field locations
+
+Oceananigans ocean-flavored fluids simulations rely fundamentally on
+"staggered grid" numerical methods.
+
+Recall that [grids](@ref grids_tutorial) represent a physical domain divided into finite volumes.
+For example, let's consider a horizontally-periodic, vertically-bounded grid of cells
+that divide up a cube with dimensions ``1 \times 1 \times 1``:
+
+```jldoctest fields
+using Oceananigans
+
+grid = RectilinearGrid(topology = (Periodic, Periodic, Bounded),
+                       size = (4, 5, 4),
+                       halo = (1, 1, 1),
+                       x = (0, 1),
+                       y = (0, 1),
+                       z = [0, 0.1, 0.3, 0.6, 1])
+
+# output
+4×5×4 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 1×1×1 halo
+├── Periodic x ∈ [0.0, 1.0) regularly spaced with Δx=0.25
+├── Periodic y ∈ [0.0, 1.0) regularly spaced with Δy=0.2
+└── Bounded  z ∈ [0.0, 1.0] variably spaced with min(Δz)=0.1, max(Δz)=0.4
+```
+
+The cubic domain is divided into a "primary mesh" of ``4 \times 5 \times 4 = 80`` cells,
+which are evenly spaced in ``x, y`` but variably spaced in ``z``.
+Now, in addition to the primary mesh, the grid also defines a set of "staggered" grids whose cells are
+shifted by half a cell width relative to the primary mesh.
+In other words, the staggered grid cells have a "location" in each direction -- either `Center`,
+and therefore co-located with the primary mesh, or `Face` and located over the interfaces of the
+primary mesh.
+For example, the primary or `Center` cell spacings in ``z`` are
+
+```jldoctest fields
+zspacings(grid, Center())
+
+# output
+4-element view(OffsetArray(::Vector{Float64}, 0:5), 1:4) with eltype Float64:
+ 0.1
+ 0.19999999999999998
+ 0.3
+ 0.4
+```
+
+corresponding to cell interfaces located at `z = [0, 0.1, 0.3, 0.6, 1]`.
+But then for the grid which is staggered in `z` relative to the primary mesh,
+
+```jldoctest fields
+zspacings(grid, Face())
+
+# output
+5-element view(OffsetArray(::Vector{Float64}, -1:5), 1:5) with eltype Float64:
+ 0.1
+ 0.15000000000000002
+ 0.24999999999999994
+ 0.3500000000000001
+ 0.3999999999999999
+```
+
+The cells for the vertically staggered grid have different spacings than the primary mesh.
+That's because the _edges_ of the vertically-staggered mesh coincide with the _nodes_ (the cell centers)
+of the primary mesh. The nodes of the primary mesh are
+
+```jldoctest fields
+znodes(grid, Center(), with_halos=true)
+
+# output
+6-element OffsetArray(::Vector{Float64}, 0:5) with eltype Float64 with indices 0:5:
+ -0.05
+  0.05
+  0.2
+  0.44999999999999996
+  0.8
+  1.2
+```
+
+The center of the leftmost "halo cell" is `z = -0.05`, while the center of the first cell from the left is `z = 0.05`.
+This means that the width of the first cell on the vertically-staggered grid is `0.05 - (-0.05) = 0.1` -- and so on.
+Finally, note that the nodes of the staggered mesh coincide with the cell interfaces of the primary mesh, so:
+
+
+```jldoctest fields
+znodes(grid, Center())
+
+# output
+4-element view(OffsetArray(::Vector{Float64}, 0:5), 1:4) with eltype Float64:
+ 0.05
+ 0.2
+ 0.44999999999999996
+ 0.8
+```
+
+In a three-dimensional domain, there are ``2^3 = 8`` meshes -- 1 primary mesh, and 7 meshes that are
+staggered to varying degrees from the primary mesh.
+This system of staggered grids is commonly used in fluid dynamics and was [invented specifically for
+simulations of the atmosphere and ocean](https://en.wikipedia.org/wiki/Arakawa_grids).
+
+### Constructing Fields at specified locations
+
+Every `Field` is associated with either the primary mesh or one of the staggered meshes by
+a three-dimensional "location" associated with each field.
+To build a fully-centered `Field`, for example, we write
+
+```jldoctest fields
+c = Field{Center, Center, Center}(grid)
+
+# output
+4×5×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+├── grid: 4×5×4 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 1×1×1 halo
+├── boundary conditions: FieldBoundaryConditions
+│   └── west: Periodic, east: Periodic, south: Periodic, north: Periodic, bottom: ZeroFlux, top: ZeroFlux, immersed: ZeroFlux
+└── data: 6×7×6 OffsetArray(::Array{Float64, 3}, 0:5, 0:6, 0:5) with eltype Float64 with indices 0:5×0:6×0:5
+    └── max=0.0, min=0.0, mean=0.0
+```
+
+Fully-centered fields also go by the alias `CenterField`,
+
+```jldoctest fields
+c == CenterField(grid)
+
+# output
+true
+```
+
+Many fluid dynamical variables are located at cell centers -- for example, tracers like temperature and salinity.
+Another common type of `Field` we encounter has cells located over the `x`-interfaces of the primary grid,
+
+```jldoctest fields
+u = Field{Face, Center, Center}(grid)
+
+# output
+4×5×4 Field{Face, Center, Center} on RectilinearGrid on CPU
+├── grid: 4×5×4 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 1×1×1 halo
+├── boundary conditions: FieldBoundaryConditions
+│   └── west: Periodic, east: Periodic, south: Periodic, north: Periodic, bottom: ZeroFlux, top: ZeroFlux, immersed: ZeroFlux
+└── data: 6×7×6 OffsetArray(::Array{Float64, 3}, 0:5, 0:6, 0:5) with eltype Float64 with indices 0:5×0:6×0:5
+    └── max=0.0, min=0.0, mean=0.0
+```
+
+which also goes by the alias `u = XFaceField(grid)`.
+The name `u` is suggestive: in the Arakawa type-C grid ('C-grid' for short) used by Oceananigans,
+the `x`-component of the velocity field is stored at the `Face, Center, Center` location.
+
+The centers of the `u` cells are shifted to the left relative to the `c` cells:
+
+```jldoctest fields
+@show xnodes(c)
+@show xnodes(u)
+nothing
+
+# output
+xnodes(c) = [0.125, 0.375, 0.625, 0.875]
+xnodes(u) = [0.0, 0.25, 0.5, 0.75]
+```
+
+Notice that the first `u`-node is at `x=0`, the left end of the grid, but the last `u`-node is at `x=0.75`.
+Because the `x`-direction is `Periodic`, the `XFaceField` `u` has 4 cells in `x` -- the cell just right of `x=0.75`
+is the same as the cell at `x=0`.
+
+Because the vertical direction is `Bounded`, however, vertically-staggered fields have more
+vertical cells than `CenterField`s:
+
+```jldoctest fields
+w = Field{Center, Center, Face}(grid)
+
+@show znodes(c)
+@show znodes(w)
+nothing
+
+# output
+znodes(c) = [0.05, 0.2, 0.44999999999999996, 0.8]
+znodes(w) = [0.0, 0.1, 0.3, 0.6, 1.0]
+```
+
+`Field`s at `Center, Center, Face` are also called `ZFaceField`,
+and the vertical velocity is a `ZFaceField` on the C-grid.
+Let's visualize the situation:
+
+```@setup fields
+using Oceananigans
+using CairoMakie
+set_theme!(Theme(fontsize=24))
+CairoMakie.activate!(type="svg")
+
+grid = RectilinearGrid(topology = (Periodic, Periodic, Bounded),
+                       size = (4, 4, 4),
+                       halo = (1, 1, 1),
+                       x = (0, 1),
+                       y = (0, 1),
+                       z = [0, 0.1, 0.3, 0.6, 1])
+
+c = CenterField(grid)
+
+u = XFaceField(grid)
+```
+
+
+```@example fields
+using CairoMakie
+
+fig = Figure(size=(600, 180))
+ax = Axis(fig[1, 1], xlabel="x")
+
+# Visualize the domain
+lines!(ax, [0, 1], [0, 0], color=:gray)
+
+xc = xnodes(c)
+xu = xnodes(u)
+
+scatter!(ax, xc, 0 * xc, marker=:circle, markersize=10, label="Cell centers")
+scatter!(ax, xu, 0 * xu, marker=:vline, markersize=20, label="Cell interfaces")
+
+ylims!(ax, -1, 1)
+xlims!(ax, -0.1, 1.1)
+hideydecorations!(ax)
+hidexdecorations!(ax, ticklabels=false, label=false)
+hidespines!(ax)
+
+Legend(fig[0, 1], ax, nbanks=2, framevisible=false)
+
+current_figure()
+```
+
+## Setting `Field`s
+
+`Field`s are full of 0's when they are created, which is not very exciting.
+The situation can be improved using [`set!`](@ref) to change the values of a field.
+For example,
+
+```jldoctest fields
+set!(c, 42)
+
+# output
+4×5×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+├── grid: 4×5×4 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 1×1×1 halo
+├── boundary conditions: FieldBoundaryConditions
+│   └── west: Periodic, east: Periodic, south: Periodic, north: Periodic, bottom: ZeroFlux, top: ZeroFlux, immersed: ZeroFlux
+└── data: 6×7×6 OffsetArray(::Array{Float64, 3}, 0:5, 0:6, 0:5) with eltype Float64 with indices 0:5×0:6×0:5
+    └── max=42.0, min=42.0, mean=42.0
+```
+
+Now `c` is filled with `42`s (for this simple case, we could also have used `c .= 42`).
+Let's confirm that:
+
+```jldoctest fields
+c[1, 1, 1]
+
+# output
+42.0
+```
+
+Looks good. And
+
+```jldoctest fields
+c[1:4, 1:5, 1]
+
+# output
+4×5 Matrix{Float64}:
+ 42.0  42.0  42.0  42.0  42.0
+ 42.0  42.0  42.0  42.0  42.0
+ 42.0  42.0  42.0  42.0  42.0
+ 42.0  42.0  42.0  42.0  42.0
+```
+
+Note that indexing into `c` is the same as indexing into `c.data`.
+
+```jldoctest fields
+c[:, :, :] == c.data
+ 
+# output
+true
+```
+
+We can also `set!` with arrays,
+
+```@setup fields
+using Random
+Random.seed!(123)
+```
+
+```@example fields
+random_stuff = rand(size(c)...)
+set!(c, random_stuff)
+
+heatmap(view(c, :, :, 1))
+```
+
+or even use functions to set,
+
+```jldoctest fields
+fun_stuff(x, y, z) = 2x
+set!(c, fun_stuff)
+
+# output
+
+4×5×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+├── grid: 4×5×4 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 1×1×1 halo
+├── boundary conditions: FieldBoundaryConditions
+│   └── west: Periodic, east: Periodic, south: Periodic, north: Periodic, bottom: ZeroFlux, top: ZeroFlux, immersed: ZeroFlux
+└── data: 6×7×6 OffsetArray(::Array{Float64, 3}, 0:5, 0:6, 0:5) with eltype Float64 with indices 0:5×0:6×0:5
+    └── max=1.75, min=0.25, mean=1.0
+```
+
+```@setup fields
+fun_stuff(x, y, z) = 2x
+set!(c, fun_stuff)
+```
+
+and plot it
+
+```@example fields
+heatmap(view(c, :, :, 1))
+```
+
+For `Field`s on three-dimensional grids, `set!` functions must have arguments `x, y, z` for
+`RectilinearGrid`, or `λ, φ, z` for `LatitudeLongitudeGrid` and `OrthogonalSphericalShellGrid`.
+But for `Field`s on one- and two-dimensional grids, only the arguments that correspond to the
+non-`Flat` directions must be included. For example, to `set!` on a one-dimensional grid we write
+
+```jldoctest fields
+# Make a field on a one-dimensional grid
+one_d_grid = RectilinearGrid(size=7, x=(0, 7), topology=(Periodic, Flat, Flat))
+one_d_c = CenterField(one_d_grid)
+
+# The one-dimensional grid varies only in `x` 
+still_pretty_fun(x) = 3x
+set!(one_d_c, still_pretty_fun)
+
+# output
+7×1×1 Field{Center, Center, Center} on RectilinearGrid on CPU
+├── grid: 7×1×1 RectilinearGrid{Float64, Periodic, Flat, Flat} on CPU with 3×0×0 halo
+├── boundary conditions: FieldBoundaryConditions
+│   └── west: Periodic, east: Periodic, south: Nothing, north: Nothing, bottom: Nothing, top: Nothing, immersed: ZeroFlux
+└── data: 13×1×1 OffsetArray(::Array{Float64, 3}, -2:10, 1:1, 1:1) with eltype Float64 with indices -2:10×1:1×1:1
+    └── max=19.5, min=1.5, mean=10.5
+```
+
+### A bit more about setting with functions
+
+Let's return to the three-dimensional `fun_stuff` case to investigate in more detail how `set!` works with functions.
+The `xnodes` of `c` -- the coordinates of the center of `c`'s finite volumes -- are:
+
+```jldoctest fields
+xc = xnodes(c)
+@show xc
+nothing # hide
+
+# output
+xc = [0.125, 0.375, 0.625, 0.875]
+```
+
+To `set!` the values of `c` we evaluate `fun_stuff` at `c`'s nodes, producing
+
+
+```jldoctest fields
+c[1:4, 1, 1]
+
+# output
+4-element Vector{Float64}:
+ 0.25
+ 0.75
+ 1.25
+ 1.75
+```
+
+!!! note
+    This function-setting method is a first-order method for computing the finite-volume
+    average of `fun_stuff` over the cells of `c`.
+    Higher-order algorithms could be implemented -- have a crack if you're keen.
+
+As a result `set!` can evaluate differently on `Field`s at different locations:
+
+```jldoctest fields
+u = XFaceField(grid)
+set!(u, fun_stuff)
+u[1:4, 1, 1]
+
+# output
+4-element Vector{Float64}:
+ 0.0
+ 0.5
+ 1.0
+ 1.5
+```
+
+## Halo regions and boundary conditions
+
+We built `grid` with `halo = (1, 1, 1)`, which means that the "interior" cells of the grid
+are surrounded by a "halo region" of cells that's one cell thick.
+The number of halo cells in each direction is stored in the properties `Hx, Hy, Hz`, so,
+
+```jldoctest fields
+(grid.Hx, grid.Hy, grid.Hz)
+
+# output
+(1, 1, 1)
+```
+
+`set!` doesn't touch halo cells.
+Check out one of the two-dimensional slices of `c` showing both the interior and the halo
+regions:
+
+
+```jldoctest fields
+c[:, :, 1]
+
+# output
+6×7 OffsetArray(::Matrix{Float64}, 0:5, 0:6) with eltype Float64 with indices 0:5×0:6:
+ 0.0  0.0   0.0   0.0   0.0   0.0   0.0
+ 0.0  0.25  0.25  0.25  0.25  0.25  0.0
+ 0.0  0.75  0.75  0.75  0.75  0.75  0.0
+ 0.0  1.25  1.25  1.25  1.25  1.25  0.0
+ 0.0  1.75  1.75  1.75  1.75  1.75  0.0
+ 0.0  0.0   0.0   0.0   0.0   0.0   0.0
+```
+
+The interior region is populated, but the surrounding halo regions are all 0.
+To remedy this situation we need to `fill_halo_regions!`:
+
+```jldoctest fields
+using Oceananigans.BoundaryConditions: fill_halo_regions!
+
+fill_halo_regions!(c)
+
+c[:, :, 1]
+
+# output
+6×7 OffsetArray(::Matrix{Float64}, 0:5, 0:6) with eltype Float64 with indices 0:5×0:6:
+ 1.75  1.75  1.75  1.75  1.75  1.75  1.75
+ 0.25  0.25  0.25  0.25  0.25  0.25  0.25
+ 0.75  0.75  0.75  0.75  0.75  0.75  0.75
+ 1.25  1.25  1.25  1.25  1.25  1.25  1.25
+ 1.75  1.75  1.75  1.75  1.75  1.75  1.75
+ 0.25  0.25  0.25  0.25  0.25  0.25  0.25
+```
+
+The way the halo regions are filled depends on `c.boundary_conditions`:
+
+```julia
+c.boundary_conditions
+
+# output
+Oceananigans.FieldBoundaryConditions, with boundary conditions
+├── west: PeriodicBoundaryCondition
+├── east: PeriodicBoundaryCondition
+├── south: PeriodicBoundaryCondition
+├── north: PeriodicBoundaryCondition
+├── bottom: FluxBoundaryCondition: Nothing
+├── top: FluxBoundaryCondition: Nothing
+└── immersed: FluxBoundaryCondition: Nothing
+```
+
+Specifically for `c` above, `x` and `y` are `Periodic` while `z` has been assigned
+the default "no-flux" boundary conditions for a `Field` with `Center` location in
+a `Bounded` direction.
+For no-flux boundary conditions, the halo regions of `c` are filled so that derivatives evaluated
+on the boundary return 0.
+To view only the interior cells of `c` we use the function `interior`,
+
+```jldoctest fields
+interior(c, :, :, 1)
+
+# output
+4×5 view(::Array{Float64, 3}, 2:5, 2:6, 2) with eltype Float64:
+ 0.25  0.25  0.25  0.25  0.25
+ 0.75  0.75  0.75  0.75  0.75
+ 1.25  1.25  1.25  1.25  1.25
+ 1.75  1.75  1.75  1.75  1.75
+```
+
+Note that the indices of `c` (and the indices of `c.data`) are "offset" so that index `1`
+corresponds to the first interior cell.
+As a result,
+
+```jldoctest fields
+c[1:4, 1:5, 1] == interior(c, :, :, 1)
+
+# output
+true
+```
+
+and more generally
+
+```jldoctest fields
+typeof(c.data)
+
+# output
+OffsetArrays.OffsetArray{Float64, 3, Array{Float64, 3}}
+```
+
+Thus, for example, the `x`-indices of `c.data` vary from `1 - Hx` to `Nx + Hx` -- in this
+case, from `0` to `5`.
+The underlying array can be accessed with `parent(c)`.
+But note that the "parent" array does not have offset indices, so
+
+```jldoctest fields
+@show parent(c)[1:2, 2, 2]
+@show c.data[1:2, 1, 1]
+nothing
+
+# output
+(parent(c))[1:2, 2, 2] = [1.75, 0.25]
+c.data[1:2, 1, 1] = [0.25, 0.75]
+```
diff --git a/docs/src/grids.md b/docs/src/grids.md
new file mode 100644
index 0000000000..71ec02adc0
--- /dev/null
+++ b/docs/src/grids.md
@@ -0,0 +1,582 @@
+# [Grids](@id grids_tutorial)
+
+```@meta
+DocTestSetup = quote
+    using Oceananigans
+    using CairoMakie
+    CairoMakie.activate!(type = "svg")
+    set_theme!(Theme(fontsize=24))
+end
+```
+
+Oceananigans simulates the dynamics of ocean-flavored fluids by solving equations that conserve momentum, mass, and energy on a grid of finite volumes or "cells".
+The first decision we make when setting up a simulation is: on what _grid_ are we going to run our simulation?
+The "grid" captures
+
+1. The geometry of the physical domain;
+2. The way that domain is divided into a mesh of finite volumes;
+3. The machine architecture (CPU, GPU, lots of CPUs or lots of GPUs); and
+4. The precision of floating point numbers (double precision or single precision).
+
+We start by making a simple grid that divides a three-dimensional rectangular domain -- "a box" -- into evenly-spaced cells,
+
+```jldoctest grids
+using Oceananigans
+
+grid = RectilinearGrid(topology = (Periodic, Periodic, Bounded),
+                       size = (16, 8, 4),
+                       x = (0, 64),
+                       y = (0, 32),
+                       z = (0, 8))
+
+# output
+16×8×4 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
+├── Periodic x ∈ [0.0, 64.0) regularly spaced with Δx=4.0
+├── Periodic y ∈ [0.0, 32.0) regularly spaced with Δy=4.0
+└── Bounded  z ∈ [0.0, 8.0]  regularly spaced with Δz=2.0
+```
+
+This simple grid
+
+* Has a domain that's "periodic" in ``x, y``, but bounded in ``z``.
+* Has `16` cells in `x`, `8` cells in `y`, and `4` cells in `z`. That means there are ``16 \times 8 \times 4 = 512`` cells in all.
+* Has an `x` dimension that spans from `x=0`, to `x=64`. And `y` spans `y=0` to `y=32`, and `z` spans `z=0` to `z=8`.
+* Has cells that are all the same size, dividing the box into 512 cells that each have dimensions ``4 \times 4 \times 2``.
+  Note that length units are whatever is used to construct the grid, so it's up to the user to make sure that all inputs use consistent units.
+
+In building our first grid, we did not specify whether it should be constructed on the [`CPU`](@ref) or [`GPU`](@ref).
+As a result, the grid was constructed by default on the CPU.
+Next we build a grid on the _GPU_ that's two-dimensional in ``x, z`` and has variably-spaced cell interfaces in the `z`-direction,
+
+```jldoctest grids_gpu
+architecture = GPU()
+z_faces = [0, 1, 3, 6, 10]
+
+grid = RectilinearGrid(architecture,
+                       topology = (Periodic, Flat, Bounded),
+                       size = (10, 4),
+                       x = (0, 20),
+                       z = z_faces)
+
+# output
+10×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on GPU with 3×0×3 halo
+├── Periodic x ∈ [0.0, 20.0) regularly spaced with Δx=2.0
+├── Flat y
+└── Bounded  z ∈ [0.0, 10.0] variably spaced with min(Δz)=1.0, max(Δz)=4.0
+```
+
+!!! note "GPU architecture requires a CUDA-enabled device"
+    To run the above example and create a grid on the GPU, an Nvidia GPU has to be available
+    and [`CUDA.jl`](https://cuda.juliagpu.org/stable/) must be working. For more information
+    see the [`CUDA.jl` documentation](https://cuda.juliagpu.org/stable/).
+
+The ``y``-dimension is "missing" because it's marked `Flat` in `topology = (Periodic, Flat, Bounded)`.
+So nothing varies in ``y``: `y`-derivatives are 0.
+Also, the keyword argument (or "kwarg" for short) that specifies the ``y``-domains may be omitted, and `size` has only two elements rather than 3 as in the first example.
+In the stretched cell interfaces specified by `z_faces`, the number of
+vertical cell interfaces is `Nz + 1 = length(z_faces) = 5`, where `Nz = 4` is the number
+of cells in the vertical.
+
+A bit later in this tutorial, we'll give examples that illustrate how to build a grid that's [`Distributed`](@ref) across _multiple_ CPUs and GPUs.
+
+## Grid types: squares, shells, and mountains
+
+The shape of the physical domain determines what grid type should be used:
+
+1. [`RectilinearGrid`](@ref Oceananigans.Grids.RectilinearGrid) can be fashioned into lines, rectangles and boxes.
+2. [`LatitudeLongitudeGrid`](@ref Oceananigans.Grids.LatitudeLongitudeGrid) represents sectors of thin spherical shells, with cells bounded by lines of constant latitude and longitude.
+3. [`OrthogonalSphericalShellGrid`](@ref Oceananigans.Grids.OrthogonalSphericalShellGrid) represents sectors of thin spherical shells divided with mesh lines that intersect at right angles (thus, orthogonal) but are otherwise arbitrary.
+
+!!! note "OrthogonalSphericalShellGrids.jl"
+    See the auxiliary package [`OrthogonalSphericalShellGrids.jl`](https://github.com/CliMA/OrthogonalSphericalShellGrids.jl)
+    for recipes that implement some useful `OrthogonalSphericalShellGrid`, including the
+    ["tripolar" grid](https://www.sciencedirect.com/science/article/abs/pii/S0021999196901369).
+
+For example, to make a `LatitudeLongitudeGrid` that wraps around the sphere, extends for 60 degrees latitude on either side of the equator, and also has 5 vertical levels down to 1000 meters, we write
+
+```jldoctest grids
+architecture = CPU()
+
+grid = LatitudeLongitudeGrid(architecture,
+                             size = (180, 10, 5),
+                             longitude = (-180, 180),
+                             latitude = (-60, 60),
+                             z = (-1000, 0))
+
+# output
+180×10×5 LatitudeLongitudeGrid{Float64, Periodic, Bounded, Bounded} on CPU with 3×3×3 halo and with precomputed metrics
+├── longitude: Periodic λ ∈ [-180.0, 180.0) regularly spaced with Δλ=2.0
+├── latitude:  Bounded  φ ∈ [-60.0, 60.0]   regularly spaced with Δφ=12.0
+└── z:         Bounded  z ∈ [-1000.0, 0.0]  regularly spaced with Δz=200.0
+```
+
+The main difference between the syntax for `LatitudeLongitudeGrid` and that for `RectilinearGrid` is the names of the horizontal coordinates:
+`LatitudeLongitudeGrid` has `longitude` and `latitude` where `RectilinearGrid` has `x` and `y`.
+
+!!! note "Extrinsic and intrinsic coordinate systems"
+    Every grid is associated with an "extrinsic" coordinate system: `RectilinearGrid` uses a Cartesian coordinate system,
+    while `LatitudeLongitudeGrid` and `OrthogonalSphericalShellGrid` use the geographic coordinates
+    `(λ, φ, z)`, where `λ` is longitude, `φ` is latitude, and `z` is height.
+    Additionally, `OrthogonalSphericalShellGrid` has an "intrinsic" coordinate system associated with the orientation
+    of its finite volumes (which, in general, are not aligned with geographic coordinates).
+    To type `λ` or `φ` at the REPL, write either `\lambda` (for `λ`) or `\varphi` (for `φ`) and then press `<TAB>`.
+
+If `topology` is not provided for `LatitudeLongitudeGrid`, then we try to infer it: if the `longitude` spans 360 degrees,
+the default `x`-topology is `Periodic`; if `longitude` spans less than 360 degrees `x`-topology is `Bounded`.
+For example,
+
+```jldoctest grids
+grid = LatitudeLongitudeGrid(size = (60, 10, 5),
+                             longitude = (0, 60),
+                             latitude = (-60, 60),
+                             z = (-1000, 0))
+
+# output
+60×10×5 LatitudeLongitudeGrid{Float64, Bounded, Bounded, Bounded} on CPU with 3×3×3 halo and with precomputed metrics
+├── longitude: Bounded  λ ∈ [0.0, 60.0]    regularly spaced with Δλ=1.0
+├── latitude:  Bounded  φ ∈ [-60.0, 60.0]  regularly spaced with Δφ=12.0
+└── z:         Bounded  z ∈ [-1000.0, 0.0] regularly spaced with Δz=200.0
+```
+
+is `Bounded` by default, because `longitude = (0, 60)`.
+
+!!! note "LatitudeLongitudeGrid topologies"
+    It's still possible to use `topology = (Periodic, Bounded, Bounded)` if `longitude` doesn't have 360 degrees.
+    But neither `latitude` nor `z` may be `Periodic` with `LatitudeLongitudeGrid`.
+
+### Bathymetry, topography, and other irregularities
+
+Irregular or "complex" domains are represented with [`ImmersedBoundaryGrid`](@ref), which combines one of the
+above underlying grids with a type of immersed boundary. The immersed boundaries we support currently are
+
+1. [`GridFittedBottom`](@ref), which fits a one- or two-dimensional bottom height to the underlying grid, so the active part of the domain is above the bottom height.
+2. [`PartialCellBottom`](@ref Oceananigans.ImmersedBoundaries.PartialCellBottom), which is similar to [`GridFittedBottom`](@ref), except that the height of the bottommost cell is changed to conform to bottom height, limited to prevent the bottom cells from becoming too thin.
+3. [`GridFittedBoundary`](@ref), which fits a three-dimensional mask to the grid.
+
+
+To build an `ImmersedBoundaryGrid`, we start by building one of the three underlying grids, and then embedding a boundary into that underlying grid.
+
+```jldoctest grids
+using Oceananigans.Units
+
+grid = RectilinearGrid(topology = (Bounded, Bounded, Bounded),
+                       size = (20, 20, 20),
+                       x = (-5kilometers, 5kilometers),
+                       y = (-5kilometers, 5kilometers),
+                       z = (0, 1kilometer))
+
+# Height and width
+H = 100meters
+W = 1kilometer
+
+mountain(x, y) = H * exp(-(x^2 + y^2) / 2W^2)
+mountain_grid = ImmersedBoundaryGrid(grid, GridFittedBottom(mountain))
+
+# output
+20×20×20 ImmersedBoundaryGrid{Float64, Bounded, Bounded, Bounded} on CPU with 3×3×3 halo:
+├── immersed_boundary: GridFittedBottom(mean(z)=6.28318, min(z)=1.58939e-8, max(z)=93.9413)
+├── underlying_grid: 20×20×20 RectilinearGrid{Float64, Bounded, Bounded, Bounded} on CPU with 3×3×3 halo
+├── Bounded  x ∈ [-5000.0, 5000.0] regularly spaced with Δx=500.0
+├── Bounded  y ∈ [-5000.0, 5000.0] regularly spaced with Δy=500.0
+└── Bounded  z ∈ [0.0, 1000.0]     regularly spaced with Δz=50.0
+```
+
+Yep, that's a Gaussian mountain:
+
+```@setup grids
+using Oceananigans
+using Oceananigans.Units
+
+using CairoMakie
+CairoMakie.activate!(type = "svg")
+set_theme!(Theme(fontsize=24))
+
+grid = RectilinearGrid(topology = (Bounded, Bounded, Bounded),
+                       size = (20, 20, 20),
+                       x = (-5kilometers, 5kilometers),
+                       y = (-5kilometers, 5kilometers),
+                       z = (0, 1kilometer))
+
+H = 100meters
+W = 1kilometer
+
+mountain(x, y) = H * exp(-(x^2 + y^2) / 2W^2)
+mountain_grid = ImmersedBoundaryGrid(grid, GridFittedBottom(mountain))
+```
+
+```@example grids
+using CairoMakie
+
+h = mountain_grid.immersed_boundary.bottom_height
+
+fig = Figure(size=(600, 600))
+ax = Axis(fig[2, 1], xlabel="x (m)", ylabel="y (m)", aspect=1)
+hm = heatmap!(ax, h)
+Colorbar(fig[1, 1], hm, vertical=false, label="Bottom height (m)")
+
+current_figure()
+```
+
+## Once more with feeling
+
+In summary, making a grid requires 
+
+* The machine architecture, or whether data is stored on the CPU, GPU, or distributed across multiple devices or nodes.
+* Information about the domain geometry. Domains can take a variety of shapes, including
+    - lines (one-dimensional),
+    - rectangles (two-dimensional),
+    - boxes (three-dimensional),
+    - sectors of thin spherical shells (two- or three-dimensional).
+    Irregular domains -- such as domains that include bathymetry or topography -- are represented by using a masking technique to "immerse" an irregular boundary within an "underlying" regular grid. Part of specifying the shape of the domain also requires specifying the nature of each dimension, which may be
+    - [`Periodic`](@ref), which means that the dimension circles back onto itself: information leaving the left side of the domain re-enters on the right.
+    - [`Bounded`](@ref), which means that the two sides of the dimension are either impenetrable (solid walls), or "open", representing a specified external state.
+    - [`Flat`](@ref), which means nothing can vary in that dimension, reducing the overall dimensionality of the grid.
+* Defining the number of cells that divide each dimension. The number of cells, with or without explicit specification of the cell interfaces, determines the spatial resolution of the grid.
+* The representation of floating point numbers, which can be single-precision (`Float32`) or double precision (`Float64`).
+
+Let's dive into each of these options in more detail.
+
+### Specifying the machine architecture
+
+The positional argument `CPU()` or `GPU()` specifies the "architecture" of the simulation.
+By using `architecture = GPU()`, any fields constructed on `grid` store their data on
+an Nvidia [`GPU`](@ref), if one is available. By default, the grid will be constructed on
+the [`CPU`](@ref) if this argument is omitted.
+So, for example,
+
+```jldoctest grids
+grid     = RectilinearGrid(size=3, z=(0, 1), topology=(Flat, Flat, Bounded))
+cpu_grid = RectilinearGrid(CPU(), size=3, z=(0, 1), topology=(Flat, Flat, Bounded))
+
+grid == cpu_grid
+
+# output
+true
+```
+
+To use more than one CPU, we use the `Distributed` architecture,
+
+```jldoctest grids
+child_architecture = CPU()
+architecture = Distributed(child_architecture)
+
+# output
+[ Info: MPI has not been initialized, so we are calling MPI.Init().
+Distributed{CPU} across 1 rank:
+├── local_rank: 0 of 0-0
+└── local_index: [1, 1, 1]
+```
+
+which allows us to distribute computations across either CPUs or GPUs.
+In this case, we didn't launch `julia` on multiple nodes using [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface),
+so we're only "distributed" across 1 node.
+<!-- For more, see [Distributed grids](@ref). -->
+More details on Distributed grids in a separate section.
+
+### Specifying the topology for each dimension
+
+The keyword argument `topology` determines if the grid is
+one-, two-, or three-dimensional (the current case), and additionally specifies the nature of each dimension.
+`topology` is always a `Tuple` with three elements (a 3-`Tuple`).
+For `RectilinearGrid`, the three elements correspond to ``(x, y, z)`` and indicate whether the respective direction is `Periodic`, `Bounded`, or `Flat`.
+A few more examples are,
+
+```julia
+topology = (Periodic, Periodic, Periodic) # triply periodic
+topology = (Periodic, Periodic, Bounded)  # periodic in x, y, bounded in z
+topology = (Periodic, Bounded, Bounded)   # periodic in x, but bounded in y, z (a "channel")
+topology = (Bounded, Bounded, Bounded)    # bounded in x, y, z (a closed box)
+topology = (Periodic, Periodic, Flat)     # two-dimensional, doubly-periodic in x, y (a torus)
+topology = (Periodic, Flat, Flat)         # one-dimensional, periodic in x (a line)
+topology = (Flat, Flat, Bounded)          # one-dimensional and bounded in z (a single column)
+```
+
+### Specifying the size of the grid
+
+The `size` is a `Tuple` that specifies the number of grid points in each direction.
+The number of tuple elements corresponds to the number of dimensions that are not `Flat`.
+
+#### The halo size
+
+An additional keyword argument `halo` allows us to set the number of "halo cells" that surround the core "interior" grid.
+The default is 3 for each non-flat coordinate.
+But we can change the halo size, for example,
+
+```jldoctest grids
+big_halo_grid = RectilinearGrid(topology = (Periodic, Periodic, Flat),
+                                size = (32, 16),
+                                halo = (7, 7),
+                                x = (0, 2π),
+                                y = (0, π))
+
+# output
+32×16×1 RectilinearGrid{Float64, Periodic, Periodic, Flat} on CPU with 7×7×0 halo
+├── Periodic x ∈ [-6.90805e-17, 6.28319) regularly spaced with Δx=0.19635
+├── Periodic y ∈ [-1.07194e-16, 3.14159) regularly spaced with Δy=0.19635
+└── Flat z
+```
+
+The `halo` size has to be set for certain advection schemes that require more halo points than the default `3` in every direction.
+Note that both `size` and `halo` are 2-`Tuple`s, rather than the 3-`Tuple` that would be required for a three-dimensional grid,
+or the single number that would be used for a one-dimensional grid.
+
+### The dimensions: `x, y, z` for `RectilinearGrid`, or `latitude, longitude, z` for `LatitudeLongitudeGrid`
+
+These keyword arguments specify the extent and location of the finite volume cells that divide up the
+three dimensions of the grid.
+For `RectilinearGrid`, the dimensions are called `x`, `y`, and `z`, whereas for `LatitudeLongitudeGrid` the
+dimensions are called `latitude`, `longitude`, and `z`.
+The type of each keyword argument determines how the dimension is divided up:
+
+* Tuples that specify only the end points indicate that the dimension should be divided into
+  equally-spaced cells. For example, `x = (0, 64)` with `size = (16, 8, 4)` means that the
+  `x`-dimension is divided into 16 cells, where the first or leftmost cell interface is located
+  at `x = 0` and the last or rightmost cell interface is located at `x = 64`. The width of each cell is `Δx=4.0`.
+* Vectors and functions alternatively give the location of each cell interface, and thereby may be used
+  to build grids that are divided into cells of varying width.
+
+## A complicated example: three-dimensional `RectilinearGrid` with variable spacing via functions
+
+Next we build a grid that is both `Bounded` and stretched in both the `y` and `z` directions.
+The purpose of the stretching is to increase grid resolution near the boundaries.
+We'll do this by using functions to specify the keyword arguments `y` and `z`.
+
+```jldoctest grids
+Nx = Ny = 64
+Nz = 32
+
+Lx = Ly = 1e4
+Lz = 1e3
+
+# Note that the face index j varies from 1 to Ny + 1
+chebychev_spaced_y_faces(j) = Ly * (1 - cos(π * (j - 1) / Ny)) / 2
+
+# Note that the face index k varies from 1 to Nz + 1
+chebychev_spaced_z_faces(k) = - Lz * (1 + cos(π * (k - 1) / Nz)) / 2
+
+grid = RectilinearGrid(size = (Nx, Ny, Nz),
+                       topology = (Periodic, Bounded, Bounded),
+                       x = (0, Lx),
+                       y = chebychev_spaced_y_faces,
+                       z = chebychev_spaced_z_faces)
+
+# output
+64×64×32 RectilinearGrid{Float64, Periodic, Bounded, Bounded} on CPU with 3×3×3 halo
+├── Periodic x ∈ [0.0, 10000.0)  regularly spaced with Δx=156.25
+├── Bounded  y ∈ [0.0, 10000.0]  variably spaced with min(Δy)=6.02272, max(Δy)=245.338
+└── Bounded  z ∈ [-1000.0, -0.0] variably spaced with min(Δz)=2.40764, max(Δz)=49.0086
+```
+
+```@setup plot
+using Oceananigans
+using CairoMakie
+CairoMakie.activate!(type = "svg")
+set_theme!(Theme(fontsize=24))
+
+Nx, Ny, Nz = 64, 64, 32
+Lx, Ly, Lz = 1e4, 1e4, 1e3
+
+chebychev_spaced_y_faces(j) = Ly * (1 - cos(π * (j - 1) / Ny)) / 2
+chebychev_spaced_z_faces(k) = - Lz * (1 + cos(π * (k - 1) / Nz)) / 2
+
+grid = RectilinearGrid(size = (Nx, Ny, Nz),
+                       topology = (Periodic, Bounded, Bounded),
+                       x = (0, Lx),
+                       y = chebychev_spaced_y_faces,
+                       z = chebychev_spaced_z_faces)
+```
+
+We can easily visualize the spacings of ``y`` and ``z`` directions. We can use, e.g.,
+[`ynodes`](@ref) and [`yspacings`](@ref) to extract the positions and spacings of the
+nodes from the grid.
+
+```@example plot
+yc = ynodes(grid, Center())
+zc = znodes(grid, Center())
+
+yf = ynodes(grid, Face())
+zf = znodes(grid, Face())
+
+Δy = yspacings(grid, Center())
+Δz = zspacings(grid, Center())
+
+using CairoMakie
+
+fig = Figure(size=(1200, 1200))
+
+axy = Axis(fig[1, 1], title="y-grid")
+lines!(axy, [0, Ly], [0, 0], color=:gray)
+scatter!(axy, yf, 0 * yf, marker=:vline, color=:gray, markersize=20)
+scatter!(axy, yc, 0 * yc)
+hidedecorations!(axy)
+hidespines!(axy)
+
+axΔy = Axis(fig[2, 1]; xlabel = "y (m)", ylabel = "y-spacing (m)")
+scatter!(axΔy, yc, Δy)
+hidespines!(axΔy, :t, :r) 
+
+axz = Axis(fig[3, 1], title="z-grid")
+lines!(axz, [-Lz, 0], [0, 0], color=:gray)
+scatter!(axz, zf, 0 * zf, marker=:vline, color=:gray, markersize=20)
+scatter!(axz, zc, 0 * zc)
+hidedecorations!(axz)
+hidespines!(axz)
+
+axΔz = Axis(fig[4, 1]; xlabel = "z (m)", ylabel = "z-spacing (m)")
+scatter!(axΔz, zc, Δz)
+hidespines!(axΔz, :t, :r)
+
+rowsize!(fig.layout, 1, Relative(0.1))
+rowsize!(fig.layout, 3, Relative(0.1))
+
+current_figure()
+```
+
+## Inspecting `LatitudeLongitudeGrid` cell spacings
+
+```@setup latlon_nodes
+using Oceananigans
+```
+
+```@example latlon_nodes
+grid = LatitudeLongitudeGrid(size = (1, 44),
+                             longitude = (0, 1),   
+                             latitude = (0, 88),
+                             topology = (Bounded, Bounded, Flat))
+
+φ = φnodes(grid, Center())
+Δx = xspacings(grid, Center(), Center())
+
+using CairoMakie
+
+fig = Figure(size=(600, 400))
+ax = Axis(fig[1, 1], xlabel="Zonal spacing on 2 degree grid (km)", ylabel="Latitude (degrees)")
+scatter!(ax, Δx ./ 1e3, φ)
+
+current_figure()
+```
+
+![](plot_lat_lon_spacings.svg)
+
+## `LatitudeLongitudeGrid` with variable spacing
+
+The syntax for building a grid with variably-spaced cells is the same as for `RectilinearGrid`.
+In our next example, we use a function to build a Mercator grid with a spacing of 2 degrees at
+the equator,
+
+```jldoctest latlon_nodes
+# Mercator scale factor
+scale_factor(φ) = 1 / cosd(φ)
+
+# Compute cell interfaces with Mercator spacing
+m = 2 # spacing at the equator in degrees
+function latitude_faces(j)
+    if j == 1 # equator
+        return 0
+    else # crudely estimate the location of the jth face 
+        φ₋ = latitude_faces(j-1)
+        φ′ = φ₋ + m * scale_factor(φ₋) / 2
+        return φ₋ + m * scale_factor(φ′)
+    end
+end
+
+Lx = 360
+Nx = Int(Lx / m)
+Ny = findfirst(latitude_faces.(1:Nx) .> 90) - 2
+
+grid = LatitudeLongitudeGrid(size = (Nx, Ny),
+                             longitude = (0, Lx),
+                             latitude = latitude_faces,
+                             topology = (Bounded, Bounded, Flat))
+
+# output
+180×28×1 LatitudeLongitudeGrid{Float64, Bounded, Bounded, Flat} on CPU with 3×3×0 halo and with precomputed metrics
+├── longitude: Bounded  λ ∈ [0.0, 360.0]   regularly spaced with Δλ=2.0
+├── latitude:  Bounded  φ ∈ [0.0, 77.2679] variably spaced with min(Δφ)=2.0003, max(Δφ)=6.95319
+└── z:         Flat z                      
+```
+
+We've also illustrated the construction of a grid that is `Flat` in the vertical direction.
+Now let's plot the metrics for this grid,
+
+```@setup plot
+# Mercator scale factor
+scale_factor(φ) = 1 / cosd(φ)
+
+# Compute cell interfaces with Mercator spacing
+m = 2 # spacing at the equator in degrees
+function latitude_faces(j)
+    if j == 1 # equator
+        return 0
+    else # crudely estimate the location of the jth face 
+        φ₋ = latitude_faces(j-1)
+        φ′ = φ₋ + m * scale_factor(φ₋) / 2
+        return φ₋ + m * scale_factor(φ′)
+    end
+end
+
+Lx = 360
+Nx = Int(Lx / m)
+
+# Deduce number of cells south of 90ᵒN
+λf = latitude_faces.(1:Nx)
+Ny = findfirst(λf .> 90) - 2
+
+grid = LatitudeLongitudeGrid(size = (Nx, Ny),
+                             longitude = (0, Lx),
+                             latitude = latitude_faces,
+                             topology = (Bounded, Bounded, Flat))
+```
+
+```@example plot
+φ = φnodes(grid, Center())
+Δx = xspacings(grid, Center(), Center(), with_halos=true)[1:Ny]
+Δy = yspacings(grid, Center())[1:Ny]
+
+using CairoMakie
+
+fig = Figure(size=(800, 400), title="Spacings on a Mercator grid")
+axx = Axis(fig[1, 1], xlabel="Zonal spacing (km)", ylabel="Latitude (degrees)")
+scatter!(axx, Δx ./ 1e3, φ)
+
+axy = Axis(fig[1, 2], xlabel="Meridional spacing (km)")
+scatter!(axy, Δy ./ 1e3, φ)
+
+hidespines!(axx, :t, :r)
+hidespines!(axy, :t, :l, :r)
+hideydecorations!(axy, grid=false)
+
+current_figure()
+```
+
+## Single-precision `RectilinearGrid`
+
+To build a grid whose fields are represented with single-precision floating point values,
+we specify the `float_type` argument along with the (optional) `architecture` argument,
+
+```jldoctest grids
+architecture = CPU()
+float_type = Float32
+
+grid = RectilinearGrid(architecture, float_type,
+                       topology = (Periodic, Periodic, Bounded),
+                       size = (16, 8, 4),
+                       x = (0, 64),
+                       y = (0, 32),
+                       z = (0, 8))
+
+# output
+16×8×4 RectilinearGrid{Float32, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
+├── Periodic x ∈ [0.0, 64.0) regularly spaced with Δx=4.0
+├── Periodic y ∈ [0.0, 32.0) regularly spaced with Δy=4.0
+└── Bounded  z ∈ [0.0, 8.0]  regularly spaced with Δz=2.0
+```
+
+!!! warning "Using single precision"
+    Single precision should be used with care.
+    Users interested in performing single-precision simulations should get in touch via
+    [Discussions](https://github.com/CliMA/Oceananigans.jl/discussions),
+    and should subject their work to extensive testing and validation.
+
+For more examples see [`RectilinearGrid`](@ref Oceananigans.Grids.RectilinearGrid)
+and [`LatitudeLongitudeGrid`](@ref Oceananigans.Grids.LatitudeLongitudeGrid).
diff --git a/docs/src/index.md b/docs/src/index.md
index 20fa6bc624..2bbe707aa4 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -79,29 +79,39 @@ cite our work and mention Oceananigans by name.
 
 If you have work using Oceananigans that you would like to have listed here, please open a pull request to add it or let us know!
 
-1. Whitley V. and Wenegrat, J. O. (2024) [Breaking internal waves on sloping topography: connecting parcel displacements to overturn size, interior-boundary exchanges, and mixing](https://doi.org/10.31223/X5PM5Q) _Earth Arxiv_. DOI:[10.31223/X5PM5Q](https://doi.org/10.31223/X5PM5Q)
+1. Chor, T. and Wenegrat, J. (2024). [The turbulent dynamics of anticyclonic submesoscale headland wakes](https://doi.org/10.31223/X5570C), _Earth arXiv_, DOI: [10.31223/X5570C](https://doi.org/10.31223/X5570C)
 
-1. Chen S., Strong-Wright J., and Taylor, J. R. (2024) [Modeling carbon dioxide removal via sinking of particulate organic carbon from macroalgae cultivation](https://doi.org/10.3389/fmars.2024.1359614) _Frontiers in Marine Science_, **11**, 1359614. DOI: [10.3389/fmars.2024.1359614](https://doi.org/10.3389/fmars.2024.1359614)
+1. Wagner, G. L., Hillier, A., Constantinou, N. C., Silvestri, S., Souza, A., Burns, K., Hill, C., Campin, J.-M., Marshall, J., and Ferrari, R. (2024). [Formulation and calibration of CATKE, a one-equation parameterization for microscale ocean mixing](https://doi.org/10.48550/arXiv.2306.13204), _arXiv_, arXiv:2306.13204. DOI: [10.48550/arXiv.2306.13204](https://doi.org/10.48550/arXiv.2306.13204)
 
-1. Gupta, M., Gürcan, E., and Thompson, A. F. (2024). [Eddy-induced dispersion of sea ice floes at the marginal ice zone](https://doi.org/10.1029/2023GL105656) _Geophysical Research Letters_, **51**, e2023GL105656. DOI: [10.1029/2023GL105656](https://doi.org/10.1029/2023GL105656)
+1. Allred, T., Li, X., Wiersdorf, A., Greenman, B., and Gopalakrishnan, G. (2024). [FlowFPX: Nimble tools for debugging floating-point exceptions](https://doi.org/10.48550/arXiv.2403.15632), _arXiv_, arXiv:2403.15632. DOI: [10.48550/arXiv.2403.15632](https://doi.org/10.48550/arXiv.2403.15632)
 
-1. Wagner, G. L., Pizzo, N. E., Lenain, L., and Veron, F. (2023) [Transition to turbulence in wind-drift layers](https://doi.org/10.1017/jfm.2023.920) _Journal of Fluid Mechanics_, **976**, A8. DOI: [10.1017/jfm.2023.920](https://doi.org/10.1017/jfm.2023.920)
+1. Silvestri, S., Wagner, G. L., Constantinou, N. C., Hill, C., Campin, J.-M., Souza, A., Bishnu, S., Churavy, V., Marshall, J., and Ferrari, R. (2024) [A GPU-based ocean dynamical core for routine mesoscale-resolving climate simulations](https://doi.org/10.22541/essoar.171708158.82342448/v1), _ESS Open Archive_. DOI: [10.22541/essoar.171708158.82342448/v1](https://doi.org/10.22541/essoar.171708158.82342448/v1)
 
-1. Jiménez-Urias, M. A. and Haine T. W. N. (2023) [On the non-self-adjoint and multiscale character of passive scalar mixing under laminar advection](https://doi.org/10.1017/jfm.2023.748) _Journal of Fluid Mechanics_, **973**, A44. DOI: [10.1017/jfm.2023.748](https://doi.org/10.1017/jfm.2023.748)
+1. Silvestri, S., Wagner, G. L., Campin, J.-M., Constantinou, N. C., Hill, C., Souza, A., and Ferrari, R. (2024). [A new WENO-based momentum advection scheme for simulations of ocean mesoscale turbulence](https://doi.org/10.22541/essoar.170110657.76489860/v2), _ESS Open Archive_. DOI: [10.22541/essoar.170110657.76489860/v2](https://doi.org/10.22541/essoar.170110657.76489860/v2)
 
-1. Strong-Wright, J, Chen, S., Constantinou, N. C., Silvestri, S., Wagner, G. L., and Taylor, J. R. (2023). [OceanBioME.jl: A flexible environment for modelling the coupled interactions between ocean biogeochemistry and physics](https://doi.org/10.21105/joss.05669) _Journal of Open Source Software_, **90(8)**, 5669. DOI: [10.21105/joss.05669](https://doi.org/10.21105/joss.05669)
+1. Whitley, V. and Wenegrat, J. O. (2024). [Breaking internal waves on sloping topography: connecting parcel displacements to overturn size, interior-boundary exchanges, and mixing](https://doi.org/10.31223/X5PM5Q), _Earth arXiv_. DOI: [10.31223/X5PM5Q](https://doi.org/10.31223/X5PM5Q)
 
-1. Wagner, G. L., Hillier, A., Constantinou, N. C., Silvestri, S., Souza, A., Burns, K., Ramadhan, A., Hill, C., Campin, J.-M., Marshall, J., and Ferrari, R. (2023). [CATKE: a turbulent-kinetic-energy-based parameterization for ocean microturbulence with dynamic convective adjustment](https://doi.org/10.22541/essoar.168748454.49694722/v1) _ESS Open Archive_. DOI: [10.22541/essoar.168748454.49694722/v1](https://doi.org/10.22541/essoar.168748454.49694722/v1)
+1. Chen S., Strong-Wright J., and Taylor, J. R. (2024) [Modeling carbon dioxide removal via sinking of particulate organic carbon from macroalgae cultivation](https://doi.org/10.3389/fmars.2024.1359614), _Frontiers in Marine Science_, **11**, 1359614. DOI: [10.3389/fmars.2024.1359614](https://doi.org/10.3389/fmars.2024.1359614)
 
-1. Ramadhan, A., Marshall, J. C., Souza, A. N., Lee, X. K., Piterbarg, U., Hillier, A., Wagner, G. L., Rackauckas, C., Hill, C., Campin, J.-M., and Ferrari, R. (2022). [Capturing missing physics in climate model parameterizations using neural differential equations](https://doi.org/10.1002/essoar.10512533.1) _ESS Open Archive_. DOI: [10.1002/essoar.10512533.1](https://doi.org/10.1002/essoar.10512533.1)
+1. Gupta, M., Gürcan, E., and Thompson, A. F. (2024). [Eddy-induced dispersion of sea ice floes at the marginal ice zone](https://doi.org/10.1029/2023GL105656), _Geophysical Research Letters_, **51**, e2023GL105656. DOI: [10.1029/2023GL105656](https://doi.org/10.1029/2023GL105656)
 
-1. Gupta, M. and Thompson, A. F. (2022). [Regimes of sea-ice floe melt: Ice-ocean coupling at the submesoscales](https://doi.org/10.1029/2022JC018894) _Journal of Geophysical Research: Oceans_, **127**, e2022JC018894. DOI: [10.1029/2022JC018894](https://doi.org/10.1029/2022JC018894)
+1. Wagner, G. L., Pizzo, N. E., Lenain, L., and Veron, F. (2023) [Transition to turbulence in wind-drift layers](https://doi.org/10.1017/jfm.2023.920), _Journal of Fluid Mechanics_, **976**, A8. DOI: [10.1017/jfm.2023.920](https://doi.org/10.1017/jfm.2023.920)
 
-1. Simoes-Sousa, I. T., Tandon, A., Pereira, F., Lazaneo, C. Z., and Mahadevan, A. (2022). [Mixed layer eddies supply nutrients to enhance the spring phytoplankton bloom](https://doi.org/10.3389/fmars.2022.825027) _Frontiers in Marine Sciences_, **9**, 825027. DOI: [10.3389/fmars.2022.825027](https://doi.org/10.3389/fmars.2022.825027)
+1. Jiménez-Urias, M. A. and Haine T. W. N. (2023) [On the non-self-adjoint and multiscale character of passive scalar mixing under laminar advection](https://doi.org/10.1017/jfm.2023.748), _Journal of Fluid Mechanics_, **973**, A44. DOI: [10.1017/jfm.2023.748](https://doi.org/10.1017/jfm.2023.748)
 
-1. Chor, T., Wenegrat, J. O., and Taylor, J. (2022). [Insights into the mixing efficiency of submesoscale Centrifugal-Symmetric instabilities.](https://doi.org/10.1175/JPO-D-21-0259.1) _Journal of Physical Oceanography_, **52(10)**, 2273-2287. DOI: [10.1175/JPO-D-21-0259.1](https://doi.org/10.1175/JPO-D-21-0259.1)
+1. Strong-Wright, J., Chen, S., Constantinou, N. C., Silvestri, S., Wagner, G. L., and Taylor, J. R. (2023). [OceanBioME.jl: A flexible environment for modelling the coupled interactions between ocean biogeochemistry and physics](https://doi.org/10.21105/joss.05669), _Journal of Open Source Software_, **8(90)**, 5669. DOI: [10.21105/joss.05669](https://doi.org/10.21105/joss.05669)
 
-1. Bire, S., Kang, W., Ramadhan, A., Campin, J.-M., and Marshall, J. (2022). [Exploring ocean circulation on icy moons heated from below.](https://doi.org/10.1029/2021JE007025) _Journal of Geophysical Research: Planets_, **127**, e2021JE007025. DOI: [10.1029/2021JE007025](https://doi.org/10.1029/2021JE007025)
+1. Ramadhan, A., Marshall, J. C., Souza, A. N., Lee, X. K., Piterbarg, U., Hillier, A., Wagner, G. L., Rackauckas, C., Hill, C., Campin, J.-M., and Ferrari, R. (2022). [Capturing missing physics in climate model parameterizations using neural differential equations](https://doi.org/10.1002/essoar.10512533.1), _ESS Open Archive_. DOI: [10.1002/essoar.10512533.1](https://doi.org/10.1002/essoar.10512533.1)
+
+1. Gupta, M. and Thompson, A. F. (2022). [Regimes of sea-ice floe melt: Ice-ocean coupling at the submesoscales](https://doi.org/10.1029/2022JC018894), _Journal of Geophysical Research: Oceans_, **127**, e2022JC018894. DOI: [10.1029/2022JC018894](https://doi.org/10.1029/2022JC018894)
+
+1. Simoes-Sousa, I. T., Tandon, A., Pereira, F., Lazaneo, C. Z., and Mahadevan, A. (2022). [Mixed layer eddies supply nutrients to enhance the spring phytoplankton bloom](https://doi.org/10.3389/fmars.2022.825027), _Frontiers in Marine Science_, **9**, 825027. DOI: [10.3389/fmars.2022.825027](https://doi.org/10.3389/fmars.2022.825027)
+
+1. Chor, T., Wenegrat, J. O., and Taylor, J. (2022). [Insights into the mixing efficiency of submesoscale Centrifugal-Symmetric instabilities](https://doi.org/10.1175/JPO-D-21-0259.1), _Journal of Physical Oceanography_, **52(10)**, 2273-2287. DOI: [10.1175/JPO-D-21-0259.1](https://doi.org/10.1175/JPO-D-21-0259.1)
+
+1. Bire, S., Kang, W., Ramadhan, A., Campin, J.-M., and Marshall, J. (2022). [Exploring ocean circulation on icy moons heated from below](https://doi.org/10.1029/2021JE007025), _Journal of Geophysical Research: Planets_, **127**, e2021JE007025. DOI: [10.1029/2021JE007025](https://doi.org/10.1029/2021JE007025)
+
+1. Rackauckas, C., Ma, Y., Martensen, J., Warner, C., Zubov, K., Supekar, R., Skinner, D., Ramadhan, A., and Edelman, A. (2021). [Universal differential equations for scientific machine learning](https://doi.org/10.48550/arXiv.2001.04385), _arXiv_, arXiv:2001.04385. DOI: [10.48550/arXiv.2001.04385](https://doi.org/10.48550/arXiv.2001.04385)
 
 1. Coakley, S., Miles, T. N., Glenn, S., and Lim, H. S. (2021). [Observation-Large eddy simulation comparison of ocean mixing under Typhoon Soulik (2018)](https://doi.org/10.23919/OCEANS44145.2021.9705670), _OCEANS 2021: San Diego – Porto, 2021_, pp. 1-7. DOI: [10.23919/OCEANS44145.2021.9705670](https://doi.org/10.23919/OCEANS44145.2021.9705670)
 
@@ -111,6 +121,6 @@ If you have work using Oceananigans that you would like to have listed here, ple
 
 1. Buffett, B. A. (2021). [Conditions for turbulent Ekman layers in precessionally driven flow](https://doi.org/10.1093/gji/ggab088), _Geophysical Journal International_, **226(1)**, 56–65. DOI: [10.1093/gji/ggab088](https://doi.org/10.1093/gji/ggab088)
 
-1. Bhamidipati, N., Souza, A.N., and Flierl, G.R. (2020). [Turbulent mixing of a passive scalar in the ocean mixed layer](https://doi.org/10.1016/j.ocemod.2020.101615). _Ocean Modelling_, **149**, 101615. DOI: [10.1016/j.ocemod.2020.101615](https://doi.org/10.1016/j.ocemod.2020.101615)
+1. Bhamidipati, N., Souza, A.N., and Flierl, G.R. (2020). [Turbulent mixing of a passive scalar in the ocean mixed layer](https://doi.org/10.1016/j.ocemod.2020.101615), _Ocean Modelling_, **149**, 101615. DOI: [10.1016/j.ocemod.2020.101615](https://doi.org/10.1016/j.ocemod.2020.101615)
 
-1. Souza, A. N., Wagner, G. L., Ramadhan, A., Allen, B., Churavy, V., Schloss, J., Campin, J. M., Hill, C., Edelman, A., Marshall, J., Flierl, G., and Ferrari, R. (2020). [Uncertainty quantification of ocean parameterizations: Application to the K‐Profile‐Parameterization for penetrative convection](https://doi.org/10.1029/2020MS002108). _Journal of Advances in Modeling Earth Systems_, **12**, e2020MS002108. DOI: [10.1029/2020MS002108](https://doi.org/10.1029/2020MS002108)
+1. Souza, A. N., Wagner, G. L., Ramadhan, A., Allen, B., Churavy, V., Schloss, J., Campin, J. M., Hill, C., Edelman, A., Marshall, J., Flierl, G., and Ferrari, R. (2020). [Uncertainty quantification of ocean parameterizations: Application to the K‐Profile‐Parameterization for penetrative convection](https://doi.org/10.1029/2020MS002108), _Journal of Advances in Modeling Earth Systems_, **12**, e2020MS002108. DOI: [10.1029/2020MS002108](https://doi.org/10.1029/2020MS002108)
diff --git a/docs/src/model_setup/background_fields.md b/docs/src/model_setup/background_fields.md
index d9cbac2493..428ce03599 100644
--- a/docs/src/model_setup/background_fields.md
+++ b/docs/src/model_setup/background_fields.md
@@ -59,7 +59,7 @@ model.background_fields.velocities.u
 FunctionField located at (Face, Center, Center)
 ├── func: U (generic function with 1 method)
 ├── grid: 1×1×1 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 1×1×1 halo
-├── clock: Clock(time=0 seconds, iteration=0, last_Δt=Inf days)
+├── clock: Clock{Float64, Float64}(time=0 seconds, iteration=0, last_Δt=Inf days)
 └── parameters: nothing
 ```
 
@@ -102,6 +102,6 @@ model.background_fields.tracers.b
 FunctionField located at (Center, Center, Center)
 ├── func: B (generic function with 1 method)
 ├── grid: 1×1×1 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 1×1×1 halo
-├── clock: Clock(time=0 seconds, iteration=0, last_Δt=Inf days)
+├── clock: Clock{Float64, Float64}(time=0 seconds, iteration=0, last_Δt=Inf days)
 └── parameters: (α = 3.14, N = 1.0, f = 0.1)
 ```
diff --git a/docs/src/model_setup/boundary_conditions.md b/docs/src/model_setup/boundary_conditions.md
index 0f73bbd614..a8c0656012 100644
--- a/docs/src/model_setup/boundary_conditions.md
+++ b/docs/src/model_setup/boundary_conditions.md
@@ -42,7 +42,7 @@ julia> no_slip_field_bcs = FieldBoundaryConditions(no_slip_bc);
 julia> model = NonhydrostaticModel(; grid, boundary_conditions=(u=no_slip_field_bcs, v=no_slip_field_bcs, w=no_slip_field_bcs))
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 16×16×16 RectilinearGrid{Float64, Periodic, Bounded, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: ()
 ├── closure: Nothing
@@ -92,8 +92,8 @@ julia> model.velocities.v.boundary_conditions
 Oceananigans.FieldBoundaryConditions, with boundary conditions
 ├── west: PeriodicBoundaryCondition
 ├── east: PeriodicBoundaryCondition
-├── south: OpenBoundaryCondition: Nothing
-├── north: OpenBoundaryCondition: Nothing
+├── south: OpenBoundaryCondition{Nothing}: Nothing
+├── north: OpenBoundaryCondition{Nothing}: Nothing
 ├── bottom: ValueBoundaryCondition: 0.0
 ├── top: FluxBoundaryCondition: Nothing
 └── immersed: FluxBoundaryCondition: Nothing
@@ -331,7 +331,7 @@ FluxBoundaryCondition: DiscreteBoundaryFunction with filtered_drag
 ### 8. Discrete-form boundary condition with parameters
 
 ```jldoctest
-julia> Cd = 0.2;  # drag coefficient
+julia> Cd = 0.2; # drag coefficient
 
 julia> @inline linear_drag(i, j, grid, clock, model_fields, Cd) = @inbounds - Cd * model_fields.u[i, j, 1];
 
@@ -404,7 +404,7 @@ julia> c_bcs = FieldBoundaryConditions(top = ValueBoundaryCondition(20.0),
 julia> model = NonhydrostaticModel(grid=grid, boundary_conditions=(u=u_bcs, c=c_bcs), tracers=:c)
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 16×16×16 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: c
 ├── closure: Nothing
@@ -436,31 +436,25 @@ top and bottom of both `model.velocities.u` and `model.tracers.c`.
 Immersed boundary conditions are supported experimentally. A no-slip boundary condition is specified
 with
 
-```jldoctest; filter = r".*@ Oceananigans.ImmersedBoundaries.*"
-julia> underlying_grid = RectilinearGrid(size=(32, 32, 16), x=(-3, 3), y=(-3, 3), z=(0, 1), topology=(Periodic, Periodic, Bounded));
-
-julia> hill(x, y) = 0.1 + 0.1 * exp(-x^2 - y^2)
-hill (generic function with 1 method)
+```@meta
+DocTestFilters = r"┌ Warning:[\s\S]*\.jl:[0-9]*"
+```
 
-julia> grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(hill))
-32×32×16 ImmersedBoundaryGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo:
-├── immersed_boundary: GridFittedBottom(mean(z)=0.108726, min(z)=0.1, max(z)=0.198258)
-├── underlying_grid: 32×32×16 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── Periodic x ∈ [-3.0, 3.0) regularly spaced with Δx=0.1875
-├── Periodic y ∈ [-3.0, 3.0) regularly spaced with Δy=0.1875
-└── Bounded  z ∈ [0.0, 1.0]  regularly spaced with Δz=0.0625
+```julia
+# Generate a simple ImmersedBoundaryGrid
+hill(x, y) = 0.1 + 0.1 * exp(-x^2 - y^2)
+underlying_grid = RectilinearGrid(size=(32, 32, 16), x=(-3, 3), y=(-3, 3), z=(0, 1), topology=(Periodic, Periodic, Bounded))
+grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(hill))
 
-julia> velocity_bcs = FieldBoundaryConditions(immersed=ValueBoundaryCondition(0.0));
+# Create a no-slip boundary condition for velocity fields.
+# Note that the no-slip boundary condition is _only_ applied on immersed boundaries.
+velocity_bcs = FieldBoundaryConditions(immersed=ValueBoundaryCondition(0))
+model = NonhydrostaticModel(; grid, boundary_conditions=(u=velocity_bcs, v=velocity_bcs, w=velocity_bcs))
 
-julia> model = NonhydrostaticModel(; grid, boundary_conditions=(u=velocity_bcs, v=velocity_bcs, w=velocity_bcs));
-┌ Warning: `ImmersedBoundaryCondition` is experimental.
-└ @ Oceananigans.ImmersedBoundaries ~/Oceananigans.jl/src/ImmersedBoundaries/immersed_boundary_condition.jl:54
-┌ Warning: `ImmersedBoundaryCondition` is experimental.
-└ @ Oceananigans.ImmersedBoundaries ~/Oceananigans.jl/src/ImmersedBoundaries/immersed_boundary_condition.jl:54
-┌ Warning: `ImmersedBoundaryCondition` is experimental.
-└ @ Oceananigans.ImmersedBoundaries ~/Oceananigans.jl/src/ImmersedBoundaries/immersed_boundary_condition.jl:54
+# Inspect the boundary condition on the vertical velocity:
+model.velocities.w.boundary_conditions.immersed
 
-julia> model.velocities.w.boundary_conditions.immersed
+# output
 ImmersedBoundaryCondition:
 ├── west: ValueBoundaryCondition: 0.0
 ├── east: ValueBoundaryCondition: 0.0
@@ -470,23 +464,36 @@ ImmersedBoundaryCondition:
 └── top: Nothing
 ```
 
+!!! warning "`NonhydrostaticModel` on `ImmersedBoundaryGrid`"
+    The pressure solver for `NonhydrostaticModel` is approximate, and is unable to produce
+    a velocity field that is simultaneously divergence-free while also satisfying impenetrability
+    on the immersed boundary. As a result, simulated dynamics with `NonhydrostaticModel` can
+    exhibit egregiously unphysical errors and should be interpreted with caution.
+
 An `ImmersedBoundaryCondition` encapsulates boundary conditions on each potential boundary-facet
 of a boundary-adjacent cell. Boundary conditions on specific faces of immersed-boundary-adjacent
 cells may also be specified by manually building an `ImmersedBoundaryCondition`:
 
-```jldoctest; filter = r".*@ Oceananigans.ImmersedBoundaries.*"
-julia> bottom_drag_bc = ImmersedBoundaryCondition(bottom=ValueBoundaryCondition(0.0))
-┌ Warning: `ImmersedBoundaryCondition` is experimental.
-└ @ Oceananigans.ImmersedBoundaries ~/Oceananigans.jl/src/ImmersedBoundaries/immersed_boundary_condition.jl:54
+```julia
+bottom_drag_bc = ImmersedBoundaryCondition(bottom=ValueBoundaryCondition(0))
+
+# output
 ImmersedBoundaryCondition:
 ├── west: Nothing
 ├── east: Nothing
 ├── south: Nothing
 ├── north: Nothing
-├── bottom: ValueBoundaryCondition: 0.0
+├── bottom: ValueBoundaryCondition: 0
 └── top: Nothing
+```
+
+The `ImmersedBoundaryCondition` may then be incorporated into the boundary conditions for a
+`Field` by prescribing it to the `immersed` boundary label,
+
+```julia
+velocity_bcs = FieldBoundaryConditions(immersed=bottom_drag_bc)
 
-julia> velocity_bcs = FieldBoundaryConditions(immersed=bottom_drag_bc)
+# output
 Oceananigans.FieldBoundaryConditions, with boundary conditions
 ├── west: DefaultBoundaryCondition (FluxBoundaryCondition: Nothing)
 ├── east: DefaultBoundaryCondition (FluxBoundaryCondition: Nothing)
@@ -512,27 +519,24 @@ of the underlying grid.
 
 First we create the boundary condition for the grid's bottom:
 
-```jldoctest immersed_bc
-julia> @inline linear_drag(x, y, t, u) = - 0.2 * u
-linear_drag (generic function with 1 method)
+```julia
+@inline linear_drag(x, y, t, u) = - 0.2 * u
+drag_u = FluxBoundaryCondition(linear_drag, field_dependencies=:u)
 
-julia> drag_u = FluxBoundaryCondition(linear_drag, field_dependencies=:u)
+# output
 FluxBoundaryCondition: ContinuousBoundaryFunction linear_drag at (Nothing, Nothing, Nothing)
 ```
 
 Next, we create the immersed boundary condition by adding the argument `z` to `linear_drag`
 and imposing drag only on "bottom" facets of cells that neighbor immersed cells:
 
-```jldoctest immersed_bc; filter = r".*@ Oceananigans.ImmersedBoundaries.*"
-julia> @inline immersed_linear_drag(x, y, z, t, u) = - 0.2 * u
-immersed_linear_drag (generic function with 1 method)
+```julia
+@inline immersed_linear_drag(x, y, z, t, u) = - 0.2 * u
+immersed_drag_u = FluxBoundaryCondition(immersed_linear_drag, field_dependencies=:u)
 
-julia> immersed_drag_u = FluxBoundaryCondition(immersed_linear_drag, field_dependencies=:u)
-FluxBoundaryCondition: ContinuousBoundaryFunction immersed_linear_drag at (Nothing, Nothing, Nothing)
+u_immersed_bc = ImmersedBoundaryCondition(bottom = immersed_drag_u)
 
-julia> u_immersed_bc = ImmersedBoundaryCondition(bottom = immersed_drag_u)
-┌ Warning: `ImmersedBoundaryCondition` is experimental.
-└ @ Oceananigans.ImmersedBoundaries ~/Oceananigans.jl/src/ImmersedBoundaries/immersed_boundary_condition.jl:54
+# output
 ImmersedBoundaryCondition:
 ├── west: Nothing
 ├── east: Nothing
@@ -544,8 +548,10 @@ ImmersedBoundaryCondition:
 
 Finally, we combine the two:
 
-```jldoctest immersed_bc
-julia> u_bcs = FieldBoundaryConditions(bottom = drag_u, immersed = u_immersed_bc)
+```julia
+u_bcs = FieldBoundaryConditions(bottom = drag_u, immersed = u_immersed_bc)
+
+# output
 Oceananigans.FieldBoundaryConditions, with boundary conditions
 ├── west: DefaultBoundaryCondition (FluxBoundaryCondition: Nothing)
 ├── east: DefaultBoundaryCondition (FluxBoundaryCondition: Nothing)
@@ -560,3 +566,8 @@ Oceananigans.FieldBoundaryConditions, with boundary conditions
     Note the difference between the arguments required for the function within the `bottom` boundary
     condition versus the arguments for the function within the `immersed` boundary condition. E.g.,
     `x, y, t` in `linear_drag()` versus `x, y, z, t` in `immersed_linear_drag()`.
+
+```@meta
+DocTestFilters = nothing
+```
+
diff --git a/docs/src/model_setup/buoyancy_and_equation_of_state.md b/docs/src/model_setup/buoyancy_and_equation_of_state.md
index 3dec28b35c..c547374072 100644
--- a/docs/src/model_setup/buoyancy_and_equation_of_state.md
+++ b/docs/src/model_setup/buoyancy_and_equation_of_state.md
@@ -28,7 +28,7 @@ julia> grid = RectilinearGrid(size=(8, 8, 8), extent=(1, 1, 1));
 julia> model = NonhydrostaticModel(; grid, buoyancy=nothing)
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 8×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: ()
 ├── closure: Nothing
@@ -43,7 +43,7 @@ The option `buoyancy = nothing` is the default for [`NonhydrostaticModel`](@ref)
 julia> model = NonhydrostaticModel(; grid)
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 8×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: ()
 ├── closure: Nothing
@@ -51,12 +51,10 @@ NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 └── coriolis: Nothing
 ```
 
-To create a `HydrostaticFreeSurfaceModel` without a buoyancy term we explicitly
-specify `buoyancy = nothing`. The default tracers `T` and `S` for `HydrostaticFreeSurfaceModel`
-may be eliminated when `buoyancy = nothing` by specifying `tracers = ()`:
+The same is true for `HydrostaticFreeSurfaceModel`,
 
 ```jldoctest buoyancy
-julia> model = HydrostaticFreeSurfaceModel(; grid, buoyancy=nothing, tracers=())
+julia> model = HydrostaticFreeSurfaceModel(; grid)
 HydrostaticFreeSurfaceModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 8×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
 ├── timestepper: QuasiAdamsBashforth2TimeStepper
@@ -79,7 +77,7 @@ a buoyancy tracer by including `:b` in `tracers` and specifying  `buoyancy = Buo
 julia> model = NonhydrostaticModel(; grid, buoyancy=BuoyancyTracer(), tracers=:b)
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 8×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: b
 ├── closure: Nothing
@@ -99,7 +97,7 @@ HydrostaticFreeSurfaceModel{CPU, RectilinearGrid}(time = 0 seconds, iteration =
 ├── buoyancy: BuoyancyTracer with ĝ = NegativeZDirection()
 ├── free surface: ImplicitFreeSurface with gravitational acceleration 9.80665 m s⁻²
 │   └── solver: FFTImplicitFreeSurfaceSolver
-├── advection scheme: 
+├── advection scheme:
 │   ├── momentum: Centered reconstruction order 2
 │   └── b: Centered reconstruction order 2
 └── coriolis: Nothing
@@ -120,37 +118,18 @@ S.I. units ``\text{m}\,\text{s}^{-2}``) and requires to add `:T` and `:S` as tra
 julia> model = NonhydrostaticModel(; grid, buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 8×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: (T, S)
 ├── closure: Nothing
 ├── buoyancy: SeawaterBuoyancy with g=9.80665 and LinearEquationOfState(thermal_expansion=0.000167, haline_contraction=0.00078) with ĝ = NegativeZDirection()
 └── coriolis: Nothing
 ```
-With `HydrostaticFreeSurfaceModel`, these are the default choices for `buoyancy` and `tracers` so,
-either including them or not we get:
-
-```jldoctest buoyancy
-julia> model = HydrostaticFreeSurfaceModel(; grid, buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
-HydrostaticFreeSurfaceModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
-├── grid: 8×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
-├── tracers: (T, S)
-├── closure: Nothing
-├── buoyancy: SeawaterBuoyancy with g=9.80665 and LinearEquationOfState(thermal_expansion=0.000167, haline_contraction=0.00078) with ĝ = NegativeZDirection()
-├── free surface: ImplicitFreeSurface with gravitational acceleration 9.80665 m s⁻²
-│   └── solver: FFTImplicitFreeSurfaceSolver
-├── advection scheme: 
-│   ├── momentum: Centered reconstruction order 2
-│   ├── T: Centered reconstruction order 2
-│   └── S: Centered reconstruction order 2
-└── coriolis: Nothing
-```
 
-is identical to the default,
+and the same is true for `HydrostaticFreeSurfaceModel`,
 
 ```jldoctest buoyancy
-julia> model = HydrostaticFreeSurfaceModel(; grid)
+julia> model = HydrostaticFreeSurfaceModel(; grid, buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
 HydrostaticFreeSurfaceModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 8×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
 ├── timestepper: QuasiAdamsBashforth2TimeStepper
@@ -159,7 +138,7 @@ HydrostaticFreeSurfaceModel{CPU, RectilinearGrid}(time = 0 seconds, iteration =
 ├── buoyancy: SeawaterBuoyancy with g=9.80665 and LinearEquationOfState(thermal_expansion=0.000167, haline_contraction=0.00078) with ĝ = NegativeZDirection()
 ├── free surface: ImplicitFreeSurface with gravitational acceleration 9.80665 m s⁻²
 │   └── solver: FFTImplicitFreeSurfaceSolver
-├── advection scheme: 
+├── advection scheme:
 │   ├── momentum: Centered reconstruction order 2
 │   ├── T: Centered reconstruction order 2
 │   └── S: Centered reconstruction order 2
@@ -178,7 +157,7 @@ SeawaterBuoyancy{Float64}:
 julia> model = NonhydrostaticModel(; grid, buoyancy, tracers=(:T, :S))
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 8×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: (T, S)
 ├── closure: Nothing
@@ -245,19 +224,25 @@ we wrap the buoyancy model in
 `Buoyancy()` function call, which takes the keyword arguments `model` and `gravity_unit_vector`,
 
 ```jldoctest buoyancy
+julia> grid = RectilinearGrid(size=(8, 8, 8), extent=(1, 1, 1));
+
 julia> θ = 45; # degrees
 
 julia> g̃ = (0, sind(θ), cosd(θ));
 
-julia> model = NonhydrostaticModel(; grid,
-                                   buoyancy=Buoyancy(model=BuoyancyTracer(), gravity_unit_vector=g̃), 
-                                   tracers=:b)
+julia> buoyancy = Buoyancy(model=BuoyancyTracer(), gravity_unit_vector=g̃)
+Buoyancy:
+├── model: BuoyancyTracer
+└── gravity_unit_vector: (0.0, 0.707107, 0.707107)
+
+julia> model = NonhydrostaticModel(; grid, buoyancy, tracers=:b)
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 8×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: b
 ├── closure: Nothing
-├── buoyancy: BuoyancyTracer with ĝ = Tuple{Float64, Float64, Float64}
+├── buoyancy: BuoyancyTracer with ĝ = (0.0, 0.707107, 0.707107)
 └── coriolis: Nothing
 ```
+
diff --git a/docs/src/model_setup/clock.md b/docs/src/model_setup/clock.md
index c2d2a2eb2b..8a8b7c388b 100644
--- a/docs/src/model_setup/clock.md
+++ b/docs/src/model_setup/clock.md
@@ -14,7 +14,9 @@ end
 
 ```jldoctest
 julia> clock = Clock(time=0.0)
-Clock{Float64, Float64}: time = 0 seconds, last_Δt = Inf days, iteration = 0, stage = 1
+Clock{Float64, Float64}(time=0 seconds, iteration=0, last_Δt=Inf days)
+├── stage: 1
+└── last_stage_Δt: Inf days
 ```
 
 but can be modified to start the model clock at some other time.
@@ -22,7 +24,9 @@ For example, passing
 
 ```jldoctest
 julia> clock = Clock(time=3600.0)
-Clock{Float64, Float64}: time = 1 hour, last_Δt = Inf days, iteration = 0, stage = 1
+Clock{Float64, Float64}(time=1 hour, iteration=0, last_Δt=Inf days)
+├── stage: 1
+└── last_stage_Δt: Inf days
 ```
 
 to the constructor for `NonhydrostaticModel` causes the simulation
@@ -37,7 +41,9 @@ for example, pass
 julia> using TimesDates
 
 julia> clock = Clock(time=TimeDate(2020))
-Clock{TimesDates.TimeDate, Float64}: time = 2020-01-01T00:00:00, last_Δt = Inf days, iteration = 0, stage = 1
+Clock{TimesDates.TimeDate, Float64}(time=2020-01-01T00:00:00, iteration=0, last_Δt=Inf days)
+├── stage: 1
+└── last_stage_Δt: Inf days
 ```
 
 to `NonhydrostaticModel`. `TimesDates.TimeDate` supports nanosecond resolution and is thus recommended
diff --git a/docs/src/model_setup/forcing_functions.md b/docs/src/model_setup/forcing_functions.md
index 49714b333d..96021b5610 100644
--- a/docs/src/model_setup/forcing_functions.md
+++ b/docs/src/model_setup/forcing_functions.md
@@ -90,8 +90,8 @@ ContinuousForcing{Float64} at (Face, Center, Center)
 ```
 
 In this example, the objects passed to the `parameters` keyword in the construction of
-`u_forcing` and `T_forcing` --- a floating point number for `u_forcing`, and a `NamedTuple`
-of parameters for `T_forcing` --- are passed on to `u_forcing_func` and `T_forcing_func` when
+`u_forcing` and `T_forcing` -- a floating point number for `u_forcing`, and a `NamedTuple`
+of parameters for `T_forcing` -- are passed on to `u_forcing_func` and `T_forcing_func` when
 they are called during time-stepping. The object passed to `parameters` is in principle arbitrary.
 However, if using the GPU, then `typeof(parameters)` may be restricted by the requirements
 of GPU-compiliability.
diff --git a/docs/src/model_setup/lagrangian_particles.md b/docs/src/model_setup/lagrangian_particles.md
index 2db8c9fd94..f7deb6b515 100644
--- a/docs/src/model_setup/lagrangian_particles.md
+++ b/docs/src/model_setup/lagrangian_particles.md
@@ -43,7 +43,7 @@ model = NonhydrostaticModel(grid=grid, particles=lagrangian_particles)
 # output
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 10×10×10 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: ()
 ├── closure: Nothing
@@ -108,17 +108,43 @@ Particle properties can be written to disk using JLD2 or NetCDF.
 
 When writing to JLD2 you can pass `model.particles` as part of the named tuple of outputs.
 
-```julia
-JLD2OutputWriter(model, (particles=model.particles,), prefix="particles", schedule=TimeInterval(15))
+```@meta
+DocTestFilters = r"└── file size: [0-9]*.[0-9]* KiB"
+```
+
+```jldoctest particles
+JLD2OutputWriter(model, (particles=model.particles,), filename="particles", schedule=TimeInterval(15))
+
+# output
+JLD2OutputWriter scheduled on TimeInterval(15 seconds):
+├── filepath: ./particles.jld2
+├── 1 outputs: particles
+├── array type: Array{Float64}
+├── including: [:grid, :coriolis, :buoyancy, :closure]
+├── file_splitting: NoFileSplitting
+└── file size: 17.6 KiB
 ```
 
 When writing to NetCDF you should write particles to a separate file as the NetCDF dimensions differ for
 particle trajectories. You can just pass `model.particles` straight to `NetCDFOutputWriter`:
 
-```julia
-NetCDFOutputWriter(model, model.particles, filepath="particles.nc", schedule=TimeInterval(15))
+```jldoctest particles
+NetCDFOutputWriter(model, model.particles, filename="particles.nc", schedule=TimeInterval(15))
+
+# output
+NetCDFOutputWriter scheduled on TimeInterval(15 seconds):
+├── filepath: ./particles.nc
+├── dimensions: particle_id(10), time(0)
+├── 1 outputs: particles
+└── array type: Array{Float64}
+├── file_splitting: NoFileSplitting
+└── file size: 9.9 KiB
 ```
 
 !!! warn "Outputting custom particle properties to NetCDF"
     NetCDF does not support arbitrary data types. If you need to write custom particle properties to disk
     that are not supported by NetCDF then you should use JLD2 (which should support almost any Julia data type).
+
+```@meta
+DocTestFilters = nothing
+```
diff --git a/docs/src/model_setup/grids.md b/docs/src/model_setup/legacy_grids.md
similarity index 98%
rename from docs/src/model_setup/grids.md
rename to docs/src/model_setup/legacy_grids.md
index dbe23b106d..cece7c0d48 100644
--- a/docs/src/model_setup/grids.md
+++ b/docs/src/model_setup/legacy_grids.md
@@ -4,7 +4,6 @@ The grids currently supported are:
 - `RectilinearGrid`s with either constant or variable grid spacings and
 - `LatitudeLongitudeGrid` on the sphere.
 
-
 ## `RectilinearGrid`
 
 A `RectilinearGrid` is constructed by specifying the `size` of the grid (a `Tuple` specifying
@@ -76,8 +75,8 @@ In that case, the `size` and `extent` are 2-tuples, e.g.,
 ```jldoctest
 julia> grid = RectilinearGrid(topology = (Periodic, Periodic, Flat), size = (32, 32), extent = (10, 20))
 32×32×1 RectilinearGrid{Float64, Periodic, Periodic, Flat} on CPU with 3×3×0 halo
-├── Periodic x ∈ [0.0, 10.0)      regularly spaced with Δx=0.3125
-├── Periodic y ∈ [0.0, 20.0)      regularly spaced with Δy=0.625
+├── Periodic x ∈ [0.0, 10.0) regularly spaced with Δx=0.3125
+├── Periodic y ∈ [0.0, 20.0) regularly spaced with Δy=0.625
 └── Flat z
 ```
 
diff --git a/docs/src/model_setup/number_type.md b/docs/src/model_setup/number_type.md
index f712f617d1..5e83724629 100644
--- a/docs/src/model_setup/number_type.md
+++ b/docs/src/model_setup/number_type.md
@@ -7,7 +7,7 @@ with 64-bit or 32-bit floating point precision.
     When not using `Float64` be careful to not mix different precisions as it could introduce implicit type conversions
     which can negatively effect performance. You can pass the number type desires to many constructors to enforce
     the type you want: e.g. `RectilinearGrid(CPU(), Float32; size=(16, 16, 16), extent=(1, 1, 1))` and
-    `scalarDiffusivity(Float16; κ=1//7, ν=2//7)`.
+    `ScalarDiffusivity(Float16; κ=1//7, ν=2//7)`.
 
 !!! warning "Effect of floating point precision on simulation accuracy"
     While we run many tests with both `Float32` and `Float64` it is not clear whether `Float32` is precise enough to
diff --git a/docs/src/model_setup/output_writers.md b/docs/src/model_setup/output_writers.md
index 98f78cbb07..999eaaaef1 100644
--- a/docs/src/model_setup/output_writers.md
+++ b/docs/src/model_setup/output_writers.md
@@ -71,7 +71,7 @@ NetCDFOutputWriter scheduled on TimeInterval(1 minute):
 ├── 2 outputs: (c, u)
 └── array type: Array{Float64}
 ├── file_splitting: NoFileSplitting
-└── file size: 14.8 KiB
+└── file size: 14.9 KiB
 ```
 
 ```jldoctest netcdf1
@@ -86,7 +86,7 @@ NetCDFOutputWriter scheduled on TimeInterval(1 minute):
 ├── 2 outputs: (c, u)
 └── array type: Array{Float64}
 ├── file_splitting: NoFileSplitting
-└── file size: 14.8 KiB
+└── file size: 14.9 KiB
 ```
 
 ```jldoctest netcdf1
@@ -187,7 +187,7 @@ NetCDFOutputWriter scheduled on IterationInterval(1):
 ├── 1 outputs: u
 └── array type: Array{Float64}
 ├── file_splitting: NoFileSplitting
-└── file size: 14.5 KiB
+└── file size: 14.6 KiB
 ```
 
 See [`NetCDFOutputWriter`](@ref) for more information.
@@ -237,7 +237,7 @@ JLD2OutputWriter scheduled on TimeInterval(20 minutes):
 ├── array type: Array{Float64}
 ├── including: [:grid, :coriolis, :buoyancy, :closure]
 ├── file_splitting: NoFileSplitting
-└── file size: 27.2 KiB
+└── file size: 28.5 KiB
 ```
 
 and a time- and horizontal-average of tracer `c` every 20 minutes of simulation time
@@ -255,7 +255,7 @@ JLD2OutputWriter scheduled on TimeInterval(20 minutes):
 ├── array type: Array{Float64}
 ├── including: [:grid, :coriolis, :buoyancy, :closure]
 ├── file_splitting: NoFileSplitting
-└── file size: 17.3 KiB
+└── file size: 18.3 KiB
 ```
 
 
@@ -312,6 +312,5 @@ JLD2OutputWriter scheduled on TimeInterval(4 days):
 ├── array type: Array{Float64}
 ├── including: [:grid, :coriolis, :buoyancy, :closure]
 ├── file_splitting: NoFileSplitting
-└── file size: 26.5 KiB
+└── file size: 27.6 KiB
 ```
-
diff --git a/docs/src/model_setup/tracers.md b/docs/src/model_setup/tracers.md
index 9ab731a325..947cb6e9ff 100644
--- a/docs/src/model_setup/tracers.md
+++ b/docs/src/model_setup/tracers.md
@@ -15,7 +15,7 @@ julia> grid = RectilinearGrid(size=(16, 16, 16), extent=(1, 1, 1));
 julia> model = NonhydrostaticModel(; grid)
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 16×16×16 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: ()
 ├── closure: Nothing
@@ -30,7 +30,7 @@ For example, to add conservative temperature `T` and absolute salinity `S`:
 julia> model = NonhydrostaticModel(; grid, tracers=(:T, :S))
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 16×16×16 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: (T, S)
 ├── closure: Nothing
@@ -65,7 +65,7 @@ An arbitrary number of tracers may be simulated. For example, to simulate
 julia> model = NonhydrostaticModel(; grid, tracers=(:T, :S, :C₁, :CO₂, :nitrogen))
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 16×16×16 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: (T, S, C₁, CO₂, nitrogen)
 ├── closure: Nothing
diff --git a/docs/src/model_setup/turbulent_diffusivity_closures_and_les_models.md b/docs/src/model_setup/turbulent_diffusivity_closures_and_les_models.md
index e092715667..175a3856c2 100644
--- a/docs/src/model_setup/turbulent_diffusivity_closures_and_les_models.md
+++ b/docs/src/model_setup/turbulent_diffusivity_closures_and_les_models.md
@@ -59,7 +59,7 @@ for all tracers need to be specified.
 
 ## Smagorinsky-Lilly
 
-To use the Smagorinsky-Lilly LES closure, no parameters are required
+To use the default Smagorinsky-Lilly LES closure, we write
 
 ```jldoctest
 julia> using Oceananigans.TurbulenceClosures
@@ -68,8 +68,8 @@ julia> closure = SmagorinskyLilly()
 SmagorinskyLilly: C=0.16, Cb=1.0, Pr=1.0
 ```
 
-although they may be specified. By default, the background viscosity and diffusivity are assumed to
-be the molecular values for seawater. For more details see [`SmagorinskyLilly`](@ref).
+The parameters `C`, `Cb`, and `Pr` may alternatively be specified explicitly.
+For more details see [`SmagorinskyLilly`](@ref).
 
 ## Anisotropic minimum dissipation
 
diff --git a/docs/src/numerical_implementation/boundary_conditions.md b/docs/src/numerical_implementation/boundary_conditions.md
index a3485036e2..ef5b6d08f4 100644
--- a/docs/src/numerical_implementation/boundary_conditions.md
+++ b/docs/src/numerical_implementation/boundary_conditions.md
@@ -123,3 +123,104 @@ boundaries ``\partial \Omega_b``:
    values of ``c``.
 
 Flux boundary conditions are represented by the [`Flux`](@ref) type.
+
+## Open boundary conditions
+
+Open boundary conditions directly specify the value of the halo points. Typically this is used
+to impose no penetration boundary conditions, i.e. setting wall normal velocity components
+to zero on the boundary.
+
+The nuance here is that open boundaries behave differently for fields on face points in the 
+boundary direction due to the [staggered grid](@ref finite_volume). For example, the u-component
+of velocity lies on `(Face, Center, Center)` points so for open `west` or `east` boundaries the 
+point specified by the boundary condition is the point lying on the boundary, whereas for a 
+tracer on `(Center, Center, Center)` points the open boundary condition specifies a point outside
+of the domain (hence the difference with `Value` boundary conditions).
+
+The other important detail is that open (including no-penetration) boundary conditions are the 
+only conditions used on wall normal velocities when the domain is not periodic. This means that 
+their value affects the pressure calculation for nonhydrostatic models as it is involved in 
+calculating the divergence in the boundary adjacent center point (as described in the 
+[fractional step method](@ref time_stepping) documentation). Usually boundary points are filled
+for the predictor velocity (i.e. before the pressure is calculated), and on the corrected field
+(i.e. after the pressure correction is applied), but for open boundaries this would result in
+the boundary adjacent center point becoming divergent so open boundaries are only filled for the 
+predictor velocity and stay the same after the pressure correction (so the boundary point is filled
+with the final corrected velocity at the predictor step).
+
+The restriction arises as the boundary condition is specifying the wall normal velocity, 
+``\hat{\boldsymbol{n}}\cdot\boldsymbol{u}``, which leads to the pressure boundary condition
+```math
+    \begin{equation}
+    \label{eq:pressure_boundary_condition}
+    \Delta t \, \hat{\boldsymbol{n}}\cdot\boldsymbol{\nabla}p^{n+1}\big |_{\partial\Omega} = \left[\hat{\boldsymbol{n}}\cdot\boldsymbol{u}^\star - \hat{\boldsymbol{n}}\cdot\boldsymbol{u}^{n+1}\right],
+    \end{equation}
+```
+implying that there is a pressure gradient across the boundary. Since we solve the pressure Poisson 
+equation (``\nabla^2p^{n+1}=\frac{\boldsymbol{\nabla}\cdot\boldsymbol{u}^\star}{\Delta t}``)
+using the method described by [Schumann88](@citet) we have to move inhomogeneous boundary conditions
+on the pressure to the right hand side. In order to do this we define a new field ``\phi`` where
+```math
+    \begin{equation}
+    \label{eq:modified_pressure_field}
+    \phi = p^{n+1} \quad \text{inside} \quad \Omega \quad \text{but} \quad \hat{\boldsymbol{n}} \cdot \boldsymbol{\nabla} \phi \, \big |_{\partial\Omega} = 0.
+    \end{equation}
+```
+This moves the boundary condition to the right hand side as ``\phi`` becomes
+```math
+    \begin{equation}
+    \label{eq:modified_pressure_poisson}
+    \boldsymbol{\nabla}^2\phi^{n+1} = \boldsymbol{\nabla}\cdot\left[\frac{\boldsymbol{u}^\star}{\Delta t} - \delta\left(\boldsymbol{x} - \boldsymbol{x}_\Omega\right)\boldsymbol{\nabla}p\right].
+    \end{equation}
+```
+Given the boundary condition on pressure given above, we can define a new modified predictor velocity
+which is equal to the predictor velocity within the domain but shares boundary conditions with the 
+corrected field,
+```math
+    \begin{equation}
+    \label{eq:quasi_predictor_velocity}
+    \tilde{\boldsymbol{u}}^\star:=\boldsymbol{u}^\star + \delta\left(\boldsymbol{x} - \boldsymbol{x}_\Omega\right)(\boldsymbol{u}^{n+1} - \boldsymbol{u}^\star).
+    \end{equation}
+```
+The modified pressure Poisson equation becomes ``\nabla^2p^{n+1}=\frac{\boldsymbol{\nabla}\cdot\tilde{\boldsymbol{u}}^\star}{\Delta t}``
+which can easily be solved. 
+
+Perhaps a more intuitive way to consider this is to recall that the corrector step projects ``\boldsymbol{u}^\star``
+to the space of divergence-free velocity by applying
+```math
+    \begin{equation}
+    \label{eq:pressure_correction_step}
+    \boldsymbol{u}^{n+1} = \boldsymbol{u}^\star - \Delta t\boldsymbol{\nabla}p^{n+1},
+    \end{equation}
+```
+but we have changed ``p^{n+1}`` to ``\phi`` and ``\boldsymbol{u}^\star`` to ``\tilde{\boldsymbol{u}}^\star``
+so for ``\boldsymbol{\nabla}\phi \big |_{\partial\Omega} = 0`` the modified predictor velocity must
+equal the corrected velocity on the boundary.
+
+For simple open boundary conditions such as no penetration or a straightforward prescription of
+a known velocity at ``t^{n+1}`` this is simple to implement as we just set the boundary condition
+on the predictor velocity and don't change it after the correction. But some open boundary methods
+calculate the boundary value based on the interior solution. As a simple example, if we wanted to 
+set the wall normal velocity gradient to zero at the west boundary then we would set the boundary 
+point to
+```math
+    \begin{equation}
+    \label{eq:zero_wall_normal_velocity_gradient}
+    u^\star_{1jk} \approx u^\star_{3jk} + (u^\star_{2jk} - u^\star_{4jk}) / 2 + \mathcal{O}(\Delta x^2),
+    \end{equation}
+```
+but we then pressure correct the interior so a new ``\mathcal{O}(\Delta t)`` error is introduced as
+```math
+    \begin{equation}
+    \begin{align}
+    u^{n+1}_{1jk} &\approx u^{n+1}_{3jk} + (u^{n+1}_{2jk} - u^{n+1}_{4jk}) / 2 + \mathcal{O}(\Delta x^2),\\
+    &= u^\star_{1jk} - \Delta t \left(\boldsymbol{\nabla}p^{n+1}_{3jk} + (\boldsymbol{\nabla}p^{n+1}_{2jk} - \boldsymbol{\nabla}p^{n+1}_{4jk}) / 2\right) + \mathcal{O}(\Delta x^2),\\
+    &\approx u^\star_{1jk} + \mathcal{O}(\Delta x^2) + \mathcal{O}(\Delta t).
+    \end{align}
+    \end{equation}
+```
+This is preferred to a divergent interior solution as open boundary conditions (except no penetration)
+are typically already unphysical and only used in an attempt to allow information to enter or exit
+the domain.
+
+Open boundary conditions are represented by the [`Open`](@ref) type.
\ No newline at end of file
diff --git a/docs/src/numerical_implementation/finite_volume.md b/docs/src/numerical_implementation/finite_volume.md
index 091501f60b..9e2378bb43 100644
--- a/docs/src/numerical_implementation/finite_volume.md
+++ b/docs/src/numerical_implementation/finite_volume.md
@@ -1,4 +1,4 @@
-# Finite volume method on a staggered grid
+# [Finite volume method on a staggered grid](@id finite_volume)
 
 The `Oceananigans.jl` staggered grid is defined by a rectilinear array of cuboids of horizontal dimensions 
 ``\Delta x_{i, j, k}, \Delta y_{i, j, k}`` and vertical dimension 
diff --git a/docs/src/operations.md b/docs/src/operations.md
new file mode 100644
index 0000000000..e50e34f1e9
--- /dev/null
+++ b/docs/src/operations.md
@@ -0,0 +1,211 @@
+# Operations and averaging
+
+`Field`s are more than mere vessels for data.
+They come equipped with a powerful infrastructure for assembling expression trees that represent
+arithmetic and discrete calculus operations.
+We also supply a system for computing reductions (averages, integrals, and cumulative integrals) of `Field`s.
+This infrastructure can be used to construct initial conditions, set up diagnostic calculations that are performed
+and saved to disk while a simulation runs, and also for post-processing.
+
+We start by constructing a `CenterField` on a simple grid,
+
+```jldoctest operations
+using Oceananigans
+
+grid = RectilinearGrid(topology = (Periodic, Flat, Bounded),
+                       size = (4, 4),
+                       x = (0, 2π),
+                       z = (-4, 0))
+
+c = CenterField(grid)
+
+periodic_but_decaying(x, z) = sin(x) * exp(z)
+set!(c, periodic_but_decaying)
+
+# output
+4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+├── grid: 4×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on CPU with 3×0×3 halo
+├── boundary conditions: FieldBoundaryConditions
+│   └── west: Periodic, east: Periodic, south: Nothing, north: Nothing, bottom: ZeroFlux, top: ZeroFlux, immersed: ZeroFlux
+└── data: 10×1×10 OffsetArray(::Array{Float64, 3}, -2:7, 1:1, -2:7) with eltype Float64 with indices -2:7×1:1×-2:7
+    └── max=0.428882, min=-0.428882, mean=1.04083e-17
+```
+
+An `AbstractOperation` (or _operation_ for short) differs from a `Field` in that it only represents a computation.
+Unlike `Field`s, `AbstractOperation`s do not have any data, and are associated only with minimal memory allocation.
+`AbstractOperations` are generated by inflicting `Field`s with ordinary arithmetic expressions,
+
+```jldoctest operations
+two_c = 2 * c
+
+# output
+BinaryOperation at (Center, Center, Center)
+├── grid: 4×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on CPU with 3×0×3 halo
+└── tree:
+    * at (Center, Center, Center)
+    ├── 2
+    └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+```
+
+and even by chaining expressions together, which may themselves include `AbstractOperations`,
+
+```jldoctest operations
+quadratic = c^2 + two_c + 1
+
+# output
+BinaryOperation at (Center, Center, Center)
+├── grid: 4×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on CPU with 3×0×3 halo
+└── tree:
+    + at (Center, Center, Center)
+    ├── + at (Center, Center, Center)
+    │   ├── ^ at (Center, Center, Center)
+    │   │   ├── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+    │   │   └── 2
+    │   └── * at (Center, Center, Center)
+        │   ├── 2
+        │   └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+    └── 1
+```
+
+Like `Field`s, `AbstractOperations` have a location and a grid. 
+In addition to `BinaryOperation`s like the kind above, `UnaryOperation`s and `MultiaryOperation`s are also supported,
+
+```jldoctest operations
+cos_c = cos(c)
+
+# output
+UnaryOperation at (Center, Center, Center)
+├── grid: 4×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on CPU with 3×0×3 halo
+└── tree:
+    cos at (Center, Center, Center) via identity
+    └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+```
+
+```jldoctest operations
+four_c = c + c + c + c
+
+# output
+MultiaryOperation at (Center, Center, Center)
+├── grid: 4×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on CPU with 3×0×3 halo
+└── tree:
+    + at (Center, Center, Center)
+    ├── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+    ├── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+    ├── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+    └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+```
+
+`UnaryOperation`, `BinaryOperation` and `MultiaryOperation` all have both an "operator", and between one and many arguments.
+Last, and definitely not least, the fourth flavor of `AbstractOperation` represents a derivative,
+
+```jldoctest operations
+dx_c = ∂x(c)
+
+# output
+Derivative at (Face, Center, Center)
+├── grid: 4×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on CPU with 3×0×3 halo
+└── tree:
+    ∂xᶠᶜᶜ at (Face, Center, Center) via identity
+    └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+```
+
+!!! note
+    The ``x``-derivative of `c` is computed by invoking the function `∂xᶠᶜᶜ`,
+    where the superscript `ᶠᶜᶜ` indicates that the result of this function
+    is located at `(Face, Center, Center)`. This convention is used throughout
+    `Oceananigans` source code. A third superscripted letter `ᵃ` stands for "any"
+    location and applies to operators that are location-agnostic in the
+    given direction.
+
+Like `Field`s, `AbstractOperations` are evaluated by indexing,
+
+```jldoctest operations
+@show c[1, 1, 1]
+@show quadratic[1, 1, 1]
+nothing
+
+# output
+c[1, 1, 1] = 0.02135277459201165
+quadratic[1, 1, 1] = 1.0431614901668005
+```
+
+Also like `Field`s, `AbstractOperation`s have a _location_. For example,
+
+```jldoctest operations
+@show location(c)
+@show location(dx_c)
+nothing
+
+# output
+location(c) = (Center, Center, Center)
+location(dx_c) = (Face, Center, Center)
+```
+
+Notice that the location of `dx_c` is shifted in `x` relative to `c`.
+Likewise, `y`-derivatives are shifted in `y` and `z`-derivatives are shifted in `z`.
+
+## Locations and interpolation
+
+_Reconstruction_ of `Field`s from one location to another is intrinsic to arithmetic on the staggered grid.
+Consider the squared magnitude of the gradient of `c`:
+
+```jldoctest operations
+∇c² = ∂x(c)^2 + ∂z(c)^2
+
+# output
+BinaryOperation at (Face, Center, Center)
+├── grid: 4×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on CPU with 3×0×3 halo
+└── tree:
+    + at (Face, Center, Center)
+    ├── ^ at (Face, Center, Center)
+    │   ├── ∂xᶠᶜᶜ at (Face, Center, Center) via identity
+    │   │   └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+    │   └── 2
+    └── ^ at (Center, Center, Face)
+        ├── ∂zᶜᶜᶠ at (Center, Center, Face) via identity
+        │   └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+        └── 2
+```
+
+Because `∂x(c)^2` is located at `(Face, Center, Center)` and `∂z(c)^2` is located at `(Center, Center, Face)`,
+a decision has to be made to compute `∇c²`.
+By default, `AbstractOperations` are reconstructed at the location of the _first_ object in the expression.
+So
+
+```jldoctest operations
+∇c²_ccf = ∂z(c)^2 + ∂x(c)^2
+
+# output
+BinaryOperation at (Center, Center, Face)
+├── grid: 4×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on CPU with 3×0×3 halo
+└── tree:
+    + at (Center, Center, Face)
+    ├── ^ at (Center, Center, Face)
+    │   ├── ∂zᶜᶜᶠ at (Center, Center, Face) via identity
+    │   │   └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+    │   └── 2
+    └── ^ at (Face, Center, Center)
+        ├── ∂xᶠᶜᶜ at (Face, Center, Center) via identity
+        │   └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+        └── 2
+```
+
+ends up at `(Center, Center, Face)`. To control the location of an operation we use the macro `@at`,
+
+```jldoctest operations
+∇c²_ccc = @at (Center, Center, Center) ∂x(c)^2 + ∂z(c)^2
+
+# output
+BinaryOperation at (Center, Center, Center)
+├── grid: 4×1×4 RectilinearGrid{Float64, Periodic, Flat, Bounded} on CPU with 3×0×3 halo
+└── tree:
+    + at (Center, Center, Center)
+    ├── ^ at (Center, Center, Center)
+    │   ├── ∂xᶠᶜᶜ at (Center, Center, Center) via ℑxᶜᵃᵃ
+    │   │   └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+    │   └── 2
+    └── ^ at (Center, Center, Center)
+        ├── ∂zᶜᶜᶠ at (Center, Center, Center) via ℑzᵃᵃᶜ
+        │   └── 4×1×4 Field{Center, Center, Center} on RectilinearGrid on CPU
+        └── 2
+```
diff --git a/docs/src/physics/hydrostatic_free_surface_model.md b/docs/src/physics/hydrostatic_free_surface_model.md
index c4a5bcf623..f70a1bf86e 100644
--- a/docs/src/physics/hydrostatic_free_surface_model.md
+++ b/docs/src/physics/hydrostatic_free_surface_model.md
@@ -3,17 +3,17 @@
 The [`HydrostaticFreeSurfaceModel`](@ref) solves the incompressible Navier-Stokes equations under
 the Boussinesq and hydrostatic approximations and with an arbitrary number of tracer conservation 
 equations. Physics associated with individual terms in the momentum and tracer conservation
-equations --- the background rotation rate of the equation's reference frame,
+equations -- the background rotation rate of the equation's reference frame,
 gravitational effects associated with buoyant tracers under the Boussinesq
 approximation, generalized stresses and tracer fluxes associated with viscous and
-diffusive physics, and arbitrary "forcing functions" --- are determined by the whims of the
+diffusive physics, and arbitrary "forcing functions" -- are determined by the whims of the
 user.
 
 ## Mass conservation and free surface evolution equation
 
 The mass conservation equation is
 ```math
-    0 = \boldsymbol{\nabla} \boldsymbol{\cdot} \boldsymbol{u} + \partial_z w \, .  
+    0 = \boldsymbol{\nabla}_h \boldsymbol{\cdot} \boldsymbol{u} + \partial_z w \, .  
 ```
 
 Given the horizontal flow ``\boldsymbol{u}`` we use the above to diagnose the vertical velocity ``w``.
diff --git a/docs/src/physics/nonhydrostatic_model.md b/docs/src/physics/nonhydrostatic_model.md
index 069408ad5c..a6be7204e7 100644
--- a/docs/src/physics/nonhydrostatic_model.md
+++ b/docs/src/physics/nonhydrostatic_model.md
@@ -3,10 +3,10 @@
 The [`NonhydrostaticModel`](@ref) solves the incompressible Navier-Stokes equations under the
 Boussinesq approximation and an arbitrary number of tracer conservation equations.
 Physics associated with individual terms in the momentum and tracer conservation
-equations --- the background rotation rate of the equation's reference frame,
+equations -- the background rotation rate of the equation's reference frame,
 gravitational effects associated with buoyant tracers under the Boussinesq
 approximation, generalized stresses and tracer fluxes associated with viscous and
-diffusive physics, and arbitrary "forcing functions" --- are determined by the whims of the
+diffusive physics, and arbitrary "forcing functions" -- are determined by the whims of the
 user.
 
 ## The momentum conservation equation
diff --git a/docs/src/physics/turbulence_closures.md b/docs/src/physics/turbulence_closures.md
index 961bdbf463..59d12e842d 100644
--- a/docs/src/physics/turbulence_closures.md
+++ b/docs/src/physics/turbulence_closures.md
@@ -75,13 +75,12 @@ strain rate.
 The eddy viscosity is given by
 ```math
     \begin{align}
-    \nu_e = \left ( C \Delta_f \right )^2 \sqrt{ \Sigma^2 } \, \varsigma(N^2 / \Sigma^2) + \nu \, ,
+    \nu_e = \left ( C \Delta_f \right )^2 \sqrt{ \Sigma^2 } \, \varsigma(N^2 / \Sigma^2) \, ,
     \label{eq:smagorinsky-viscosity}
     \end{align}
 ```
-where ``\Delta_f`` is the "filter width" associated with the finite volume grid spacing,
-``C`` is a user-specified model constant, ``\Sigma^2 \equiv \Sigma_{ij} \Sigma_{ij}``, and
-``\nu`` is a constant isotropic background viscosity.
+where ``\Delta_f`` is the "filter width" associated with the finite volume grid spacing,
+``C`` is a user-specified model constant, and ``\Sigma^2 \equiv \Sigma_{ij} \Sigma_{ij}``.
 The factor ``\varsigma(N^2 / \Sigma^2)`` reduces ``\nu_e`` in regions of
 strong stratification via
 ```math
@@ -103,9 +102,8 @@ The effect of subgrid turbulence on tracer mixing is also modeled diffusively vi
 ```
 where the eddy diffusivity ``\kappa_e`` is
 ```math
-\kappa_e = \frac{\nu_e - \nu}{Pr} + \kappa \, ,
+\kappa_e = \frac{\nu_e}{Pr} + \kappa \, .
 ```
-where ``\kappa`` is a constant isotropic background diffusivity.
 Both ``Pr`` and ``\kappa`` may be set independently for each tracer.
 
 ## Anisotropic minimum dissipation (AMD) turbulence closure
diff --git a/docs/src/quick_start.md b/docs/src/quick_start.md
index 34312b9454..923296262c 100644
--- a/docs/src/quick_start.md
+++ b/docs/src/quick_start.md
@@ -2,10 +2,19 @@
 
 This code:
 
+```@setup cpu
+using CairoMakie
+CairoMakie.activate!(type = "png")
+```
+
 ```@example cpu
 using Oceananigans
 
-grid = RectilinearGrid(size=(128, 128), x=(0, 2π), y=(0, 2π), topology=(Periodic, Periodic, Flat))
+grid = RectilinearGrid(size = (128, 128),
+                       x = (0, 2π),
+                       y = (0, 2π),
+                       topology = (Periodic, Periodic, Flat))
+
 model = NonhydrostaticModel(; grid, advection=WENO())
 
 ϵ(x, y) = 2rand() - 1
@@ -15,7 +24,8 @@ simulation = Simulation(model; Δt=0.01, stop_iteration=100)
 run!(simulation)
 ```
 
-runs 100 time steps of a two-dimensional turbulence simulation with `128²` [finite volume](https://en.wikipedia.org/wiki/Finite_volume_method) cells and a fifth-order upwinded [WENO advection scheme](https://en.wikipedia.org/wiki/WENO_methods).
+runs 100 time steps of a two-dimensional turbulence simulation with `128²` [finite volume](https://en.wikipedia.org/wiki/Finite_volume_method) cells
+and a fifth-order upwinded [WENO advection scheme](https://en.wikipedia.org/wiki/WENO_methods).
 It's quite similar to the [two-dimensional turbulence example](https://clima.github.io/OceananigansDocumentation/stable/generated/two_dimensional_turbulence/).
 
 ## Visualization
@@ -29,7 +39,7 @@ u, v, w = model.velocities
 ζ = Field(∂x(v) - ∂y(u))
 compute!(ζ)
 
-heatmap(interior(ζ, :, :, 1))
+heatmap(ζ, axis=(; aspect=1))
 ```
 
 A few more time-steps, and it's starting to get a little diffuse!
@@ -39,28 +49,60 @@ simulation.stop_iteration += 400
 run!(simulation)
 
 compute!(ζ)
-heatmap(interior(ζ, :, :, 1))
+heatmap(ζ, axis=(; aspect=1))
 ```
 
 ## They always cheat with too-simple "quick" starts
 
-Fine, we'll re-run this code on the GPU then:
+Fine, we'll re-run this code on the GPU. But we're a little greedy, so we'll also
+crank up the resolution, throw in a `TimeStepWizard` to update `simulation.Δt` adaptively,
+and add a passive tracer initially concentrated in the center of the domain
+which will make for an even prettier figure of the final state:
 
-```julia
+```@setup gpu
+using CairoMakie
+CairoMakie.activate!(type = "png")
+```
+
+```@example gpu
 using Oceananigans
+using CairoMakie
 
-grid = RectilinearGrid(GPU(), size=(128, 128), x=(0, 2π), y=(0, 2π), topology=(Periodic, Periodic, Flat))
-model = NonhydrostaticModel(; grid, advection=WENO())
+grid = RectilinearGrid(GPU(),
+                       size = (1024, 1024),
+                       x = (-π, π),
+                       y = (-π, π),
+                       topology = (Periodic, Periodic, Flat))
 
+model = NonhydrostaticModel(; grid, advection=WENO(), tracers=:c)
+
+δ = 0.5
+cᵢ(x, y) = exp(-(x^2 + y^2) / 2δ^2)
 ϵ(x, y) = 2rand() - 1
-set!(model, u=ϵ, v=ϵ)
+set!(model, u=ϵ, v=ϵ, c=cᵢ)
 
-simulation = Simulation(model; Δt=0.01, stop_iteration=100)
+simulation = Simulation(model; Δt=1e-3, stop_time=10)
+conjure_time_step_wizard!(simulation, cfl=0.2, IterationInterval(10))
 run!(simulation)
+
+u, v, w = model.velocities
+ζ = Field(∂x(v) - ∂y(u))
+compute!(ζ)
+
+fig = Figure(size=(1200, 600))
+axζ = Axis(fig[1, 1], aspect=1, title="vorticity")
+axc = Axis(fig[1, 2], aspect=1, title="tracer")
+heatmap!(axζ, ζ, colormap=:balance)
+heatmap!(axc, model.tracers.c)
+current_figure()
 ```
 
-Notice the difference? We passed the positional argument `GPU()` to `RectilinearGrid`.
+See how we did that? We passed the positional argument `GPU()` to `RectilinearGrid`.
+(This only works if a GPU is available, of course, and
+[CUDA.jl is configured](https://cuda.juliagpu.org/stable/installation/overview/).)
 
 ## Well, that was tantalizing
 
-But you'll need to know a lot more to become a productive, Oceananigans-wielding computational scientist (spherical grids, forcing, boundary conditions, turbulence closures, output writing, actually labeling your axes... 🤯). It'd be best to move on to the [one-dimensional diffusion example](@ref one_dimensional_diffusion_example).
+But you'll need to know a lot more to become a productive, Oceananigans-wielding computational scientist (spherical grids, forcing, boundary conditions,
+turbulence closures, output writing, actually labeling your axes... 🤯).
+It'd be best to move on to the [one-dimensional diffusion example](@ref one_dimensional_diffusion_example).
diff --git a/docs/src/simulation_tips.md b/docs/src/simulation_tips.md
index ab9382d088..b285b4ad6d 100644
--- a/docs/src/simulation_tips.md
+++ b/docs/src/simulation_tips.md
@@ -174,7 +174,7 @@ compute!(ε)
 ### Try to decrease the memory-use of your runs
 
 GPU runs are sometimes memory-limited. A state-of-the-art Tesla V100 GPU has 32GB of
-memory --- enough memory for simulations with about 100 million points, or grids a bit smaller
+memory -- enough memory for simulations with about 100 million points, or grids a bit smaller
 than 512 × 512 × 512. (The maximum grid size depends on some user-specified factors,
 like the number of passive tracers or computed diagnostics.)
 For large simulations on the GPU, careful management of memory allocation may be required:
@@ -232,7 +232,7 @@ julia> grid = RectilinearGrid(GPU(); size=(1, 1, 1), extent=(1, 1, 1), halo=(1,
 julia> model = NonhydrostaticModel(; grid)
 NonhydrostaticModel{GPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 1×1×1 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on GPU with 1×1×1 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── tracers: ()
 ├── closure: Nothing
 ├── buoyancy: Nothing
@@ -275,7 +275,7 @@ Here `CUDA.jl` throws an error because scalar `getindex` is not `allowed`. There
 overcome this limitation and allow scalar indexing (more about that 
 in the [CUDA.jl documentation](https://cuda.juliagpu.org/stable/usage/workflow/#UsageWorkflowScalar)), but this option
 can be very slow on GPUs, so it is advised to only use this last method when using the REPL or 
-prototyping --- never in production-ready scripts.
+prototyping -- never in production-ready scripts.
 
 You might also need to keep these differences in mind when using arrays
 to define initial conditions, boundary conditions or
diff --git a/docs/src/simulations.md b/docs/src/simulations.md
new file mode 100644
index 0000000000..375437d947
--- /dev/null
+++ b/docs/src/simulations.md
@@ -0,0 +1,10 @@
+# Simulations
+
+`Simulation`s are basically a utility for managing a time-stepping loop, including scheduling
+important activities such as:
+
+* Logging the progress of a simulation,
+* Computing and writing diagnostics or other output to disk,
+* Stopping a simulation when its time has come.
+
+The most important line in any script is `run!(simulation)`.
diff --git a/examples/baroclinic_adjustment.jl b/examples/baroclinic_adjustment.jl
index ffb9b2d9a4..a14c258753 100644
--- a/examples/baroclinic_adjustment.jl
+++ b/examples/baroclinic_adjustment.jl
@@ -80,8 +80,8 @@ x, y, z = 1e-3 .* nodes(grid, (Center(), Center(), Center()))
 
 b = model.tracers.b
 
-fig, ax, hm = heatmap(y, z, interior(b)[1, :, :],
-                      colormap=:deep,
+fig, ax, hm = heatmap(view(b, 1, :, :),
+                      colormap = :deep,
                       axis = (xlabel = "y [km]",
                               ylabel = "z [km]",
                               title = "b(x=0, y, z, t=0)",
@@ -178,7 +178,7 @@ using CairoMakie
 
 # ### Three-dimensional visualization
 #
-# We load the saved buoyancy output on the top, bottom, north, and east surface as `FieldTimeSeries`es.
+# We load the saved buoyancy output on the top, north, and east surfaces as `FieldTimeSeries`es.
 
 filename = "baroclinic_adjustment"
 
@@ -188,7 +188,6 @@ slice_filenames = NamedTuple(side => filename * "_$(side)_slice.jld2" for side i
 
 b_timeserieses = (east   = FieldTimeSeries(slice_filenames.east, "b"),
                   north  = FieldTimeSeries(slice_filenames.north, "b"),
-                  bottom = FieldTimeSeries(slice_filenames.bottom, "b"),
                   top    = FieldTimeSeries(slice_filenames.top, "b"))
 
 B_timeseries = FieldTimeSeries(filename * "_zonal_average.jld2", "b")
@@ -204,6 +203,7 @@ xb = xb ./ 1e3 # convert m -> km
 yb = yb ./ 1e3 # convert m -> km
 
 Nx, Ny, Nz = size(grid)
+
 x_xz = repeat(x, 1, Nz)
 y_xz_north = y[end] * ones(Nx, Nz)
 z_xz = repeat(reshape(z, 1, Nz), Nx, 1)
@@ -215,7 +215,6 @@ z_yz = repeat(reshape(z, 1, Nz), grid.Ny, 1)
 x_xy = x
 y_xy = y
 z_xy_top = z[end] * ones(grid.Nx, grid.Ny)
-z_xy_bottom = z[1] * ones(grid.Nx, grid.Ny)
 nothing #hide
 
 # Then we create a 3D axis. We use `zonal_slice_displacement` to control where the plot of the instantaneous
@@ -253,7 +252,6 @@ n = length(times)
 
 b_slices = (east   = interior(b_timeserieses.east[n], 1, :, :),
             north  = interior(b_timeserieses.north[n], :, 1, :),
-            bottom = interior(b_timeserieses.bottom[n], :, :, 1),
             top    = interior(b_timeserieses.top[n], :, :, 1))
 
 ## Zonally-averaged buoyancy
@@ -261,11 +259,11 @@ B = interior(B_timeseries[n], 1, :, :)
 
 clims = 1.1 .* extrema(b_timeserieses.top[n][:])
 
-kwargs = (colorrange=clims, colormap=:deep)
-surface!(ax, x_yz_east, y_yz, z_yz;    color = b_slices.east, kwargs...)
-surface!(ax, x_xz, y_xz_north, z_xz;   color = b_slices.north, kwargs...)
-surface!(ax, x_xy, y_xy, z_xy_bottom ; color = b_slices.bottom, kwargs...)
-surface!(ax, x_xy, y_xy, z_xy_top;     color = b_slices.top, kwargs...)
+kwargs = (colorrange=clims, colormap=:deep, shading=NoShading)
+
+surface!(ax, x_yz_east, y_yz, z_yz;  color = b_slices.east, kwargs...)
+surface!(ax, x_xz, y_xz_north, z_xz; color = b_slices.north, kwargs...)
+surface!(ax, x_xy, y_xy, z_xy_top;   color = b_slices.top, kwargs...)
 
 sf = surface!(ax, zonal_slice_displacement .* x_yz_east, y_yz, z_yz; color = B, kwargs...)
 
@@ -356,4 +354,3 @@ end
 nothing #hide
 
 # ![](baroclinic_adjustment.mp4)
-
diff --git a/examples/convecting_plankton.jl b/examples/convecting_plankton.jl
index 397448aeda..d7aa86fb8c 100644
--- a/examples/convecting_plankton.jl
+++ b/examples/convecting_plankton.jl
@@ -212,15 +212,9 @@ avg_P_timeseries = FieldTimeSeries(filepath, "avg_P")
 
 times = w_timeseries.times
 buoyancy_flux_time_series = [buoyancy_flux(0, t, buoyancy_flux_parameters) for t in times]
-nothing #hide
-
-# and then we construct the ``x, z`` grid,
-
-xw, yw, zw = nodes(w_timeseries)
-xp, yp, zp = nodes(P_timeseries)
-nothing #hide
+nothing # hide
 
-# Finally, we animate plankton mixing and blooming,
+# Now, we animate plankton mixing and blooming,
 
 using CairoMakie
 
@@ -230,9 +224,9 @@ n = Observable(1)
 
 title = @lift @sprintf("t = %s", prettytime(times[$n]))
 
-wₙ = @lift interior(w_timeseries[$n], :, 1, :)
-Pₙ = @lift interior(P_timeseries[$n], :, 1, :)
-avg_Pₙ = @lift interior(avg_P_timeseries[$n], 1, 1, :)
+wn = @lift w_timeseries[$n]
+Pn = @lift P_timeseries[$n]
+avg_Pn = @lift avg_P_timeseries[$n]
 
 w_lim = maximum(abs, interior(w_timeseries))
 w_lims = (-w_lim, w_lim)
@@ -250,17 +244,17 @@ xlims!(ax_avg_P, 0.85, 1.3)
 
 fig[1, 1:3] = Label(fig, title, tellwidth=false)
 
-hm_w = heatmap!(ax_w, xw, zw, wₙ; colormap = :balance, colorrange = w_lims)
+hm_w = heatmap!(ax_w, wn; colormap = :balance, colorrange = w_lims)
 Colorbar(fig[2, 1], hm_w; label = "Vertical velocity (m s⁻¹)", flipaxis = false)
 
-hm_P = heatmap!(ax_P, xp, zp, Pₙ; colormap = :matter, colorrange = P_lims)
+hm_P = heatmap!(ax_P, Pn; colormap = :matter, colorrange = P_lims)
 Colorbar(fig[3, 1], hm_P; label = "Plankton 'concentration'", flipaxis = false)
 
 lines!(ax_b, times ./ hour, buoyancy_flux_time_series; linewidth = 1, color = :black, alpha = 0.4)
 
 b_flux_point = @lift Point2(times[$n] / hour, buoyancy_flux_time_series[$n])
 scatter!(ax_b, b_flux_point; marker = :circle, markersize = 16, color = :black)
-lines!(ax_avg_P, avg_Pₙ, zp)
+lines!(ax_avg_P, avg_Pn)
 
 current_figure() #hide
 fig
diff --git a/examples/horizontal_convection.jl b/examples/horizontal_convection.jl
index b5485f7dc3..44fa24b6a3 100644
--- a/examples/horizontal_convection.jl
+++ b/examples/horizontal_convection.jl
@@ -30,14 +30,14 @@ using Printf
 
 # ### The grid
 
-H = 1.0          # vertical domain extent
+H = 1            # vertical domain extent
 Lx = 2H          # horizontal domain extent
 Nx, Nz = 128, 64 # horizontal, vertical resolution
 
 grid = RectilinearGrid(size = (Nx, Nz),
-                          x = (-Lx/2, Lx/2),
-                          z = (-H, 0),
-                   topology = (Bounded, Flat, Bounded))
+                       x = (-Lx/2, Lx/2),
+                       z = (-H, 0),
+                       topology = (Bounded, Flat, Bounded))
 
 # ### Boundary conditions
 #
@@ -48,8 +48,7 @@ grid = RectilinearGrid(size = (Nx, Nz),
 # while zero-flux boundary conditions are imposed on all other boundaries. We use free-slip 
 # boundary conditions on ``u`` and ``w`` everywhere.
 
-b★ = 1.0
-
+b★ = 1  
 @inline bˢ(x, t, p) = - p.b★ * cos(2π * x / p.Lx)
 
 b_bcs = FieldBoundaryConditions(top = ValueBoundaryCondition(bˢ, parameters=(; b★, Lx)))
@@ -77,7 +76,7 @@ b_bcs = FieldBoundaryConditions(top = ValueBoundaryCondition(bˢ, parameters=(;
 # We use isotropic viscosity and diffusivities, `ν` and `κ` whose values are obtain from the
 # prescribed ``Ra`` and ``Pr`` numbers. Here, we use ``Pr = 1`` and ``Ra = 10^8``:
 
-Pr = 1.0    # Prandtl number
+Pr = 1      # Prandtl number
 Ra = 1e8    # Rayleigh number
 
 ν = sqrt(Pr * b★ * Lx^3 / Ra)  # Laplacian viscosity
@@ -107,19 +106,15 @@ simulation = Simulation(model, Δt=1e-2, stop_time=40.0)
 # ### The `TimeStepWizard`
 #
 # The `TimeStepWizard` manages the time-step adaptively, keeping the Courant-Freidrichs-Lewy 
-# (CFL) number close to `0.75` while ensuring the time-step does not increase beyond the 
-# maximum allowable value for numerical stability while avoiding the time-step to change more than
-# the predefined maximum change (`new_Δt ≤ max_change * old_Δt`).
-
-wizard = TimeStepWizard(cfl=0.75, max_change=1.2, max_Δt=1e-1)
+# (CFL) number close to `0.7`.
 
-simulation.callbacks[:wizard] = Callback(wizard, IterationInterval(50))
+conjure_time_step_wizard!(simulation, IterationInterval(50), cfl=0.7, max_Δt=1e-1)
 
 # ### A progress messenger
 #
 # We write a function that prints out a helpful progress message while the simulation runs.
 
-progress(sim) = @printf("i: % 6d, sim time: % 1.3f, wall time: % 10s, Δt: % 1.4f, advective CFL: %.2e, diffusive CFL: %.2e\n",
+progress(sim) = @printf("Iter: % 6d, sim time: % 1.3f, wall time: % 10s, Δt: % 1.4f, advective CFL: %.2e, diffusive CFL: %.2e\n",
                         iteration(sim), time(sim), prettytime(sim.run_wall_time),
                         sim.Δt, AdvectiveCFL(sim.Δt)(sim.model), DiffusiveCFL(sim.Δt)(sim.model))
 
@@ -192,9 +187,9 @@ nothing #hide
 
 χ_timeseries = deepcopy(b_timeseries)
 
-for i in 1:length(times)
-  bᵢ = b_timeseries[i]
-  χ_timeseries[i] .= @at (Center, Center, Center) κ * (∂x(bᵢ)^2 + ∂z(bᵢ)^2)
+for n in 1:length(times)
+    bn = b_timeseries[n]
+    χ_timeseries[n] .= @at (Center, Center, Center) κ * (∂x(bn)^2 + ∂z(bn)^2)
 end
 
 
@@ -206,10 +201,10 @@ n = Observable(1)
 
 title = @lift @sprintf("t=%1.2f", times[$n])
 
-sₙ = @lift interior(s_timeseries[$n], :, 1, :)
-ζₙ = @lift interior(ζ_timeseries[$n], :, 1, :)
-bₙ = @lift interior(b_timeseries[$n], :, 1, :)
-χₙ = @lift interior(χ_timeseries[$n], :, 1, :)
+sn = @lift s_timeseries[$n]
+ζn = @lift ζ_timeseries[$n]
+bn = @lift b_timeseries[$n]
+χn = @lift χ_timeseries[$n]
 
 slim = 0.6
 blim = 0.6
@@ -224,38 +219,26 @@ axis_kwargs = (xlabel = L"x / H",
 
 fig = Figure(size = (600, 1100))
 
-ax_s = Axis(fig[2, 1];
-            title = L"speed, $(u^2+w^2)^{1/2} / (L_x b_*) ^{1/2}", axis_kwargs...)
-
-ax_b = Axis(fig[3, 1];
-            title = L"buoyancy, $b / b_*$", axis_kwargs...)
+ax_s = Axis(fig[2, 1]; title = L"speed, $(u^2+w^2)^{1/2} / (L_x b_*)^{1/2}$", axis_kwargs...)
+ax_b = Axis(fig[3, 1]; title = L"buoyancy, $b / b_*$", axis_kwargs...)
+ax_ζ = Axis(fig[4, 1]; axis_kwargs...,
+            title = L"vorticity, $(∂u/∂z - ∂w/∂x) \, (L_x / b_*)^{1/2}$")
 
-ax_ζ = Axis(fig[4, 1];
-            title = L"vorticity, $(∂u/∂z - ∂w/∂x) \, (L_x / b_*)^{1/2}$", axis_kwargs...)
-
-ax_χ = Axis(fig[5, 1];
-            title = L"buoyancy dissipation, $κ |\mathbf{\nabla}b|^2 \, (L_x / {b_*}^5)^{1/2}$", axis_kwargs...)
+ax_χ = Axis(fig[5, 1]; axis_kwargs...,
+            title = L"buoyancy dissipation, $κ |\mathbf{\nabla}b|^2 \, (L_x / {b_*}^5)^{1/2}$")
 
 fig[1, :] = Label(fig, title, fontsize=24, tellwidth=false)
 
-hm_s = heatmap!(ax_s, xc, zc, sₙ;
-                colorrange = (0, slim),
-                colormap = :speed)
+hm_s = heatmap!(ax_s, sn; colorrange=(0, slim), colormap=:speed)
 Colorbar(fig[2, 2], hm_s)
 
-hm_b = heatmap!(ax_b, xc, zc, bₙ;
-                colorrange = (-blim, blim),
-                colormap = :thermal)
+hm_b = heatmap!(ax_b, bn; colorrange=(-blim, blim), colormap=:thermal)
 Colorbar(fig[3, 2], hm_b)
 
-hm_ζ = heatmap!(ax_ζ, xζ, zζ, ζₙ;
-                colorrange = (-ζlim, ζlim),
-                colormap = :balance)
+hm_ζ = heatmap!(ax_ζ, ζn; colorrange=(-ζlim, ζlim), colormap=:balance)
 Colorbar(fig[4, 2], hm_ζ)
 
-hm_χ = heatmap!(ax_χ, xc, zc, χₙ;
-                colorrange = (0, χlim),
-                colormap = :dense)
+hm_χ = heatmap!(ax_χ, χn; colorrange=(0, χlim), colormap=:dense)
 Colorbar(fig[5, 2], hm_χ)
 
 # And, finally, we record a movie.
@@ -325,15 +308,15 @@ nothing #hide
 # Now we can loop over the fields in the `FieldTimeSeries`, compute kinetic energy and ``Nu``,
 # and plot. We make use of `Integral` to compute the volume integral of fields over our domain.
 
-for i = 1:length(t)
-    ke = Field(Integral(1/2 * s_timeseries[i]^2 / (Lx * H)))
+for n = 1:length(t)
+    ke = Field(Integral(1/2 * s_timeseries[n]^2 / (Lx * H)))
     compute!(ke)
-    kinetic_energy[i] = ke[1, 1, 1]
+    kinetic_energy[n] = ke[1, 1, 1]
     
-    χ = Field(Integral(χ_timeseries[i] / (Lx * H)))
+    χ = Field(Integral(χ_timeseries[n] / (Lx * H)))
     compute!(χ)
 
-    Nu[i] = χ[1, 1, 1] / χ_diff
+    Nu[n] = χ[1, 1, 1] / χ_diff
 end
 
 fig = Figure(size = (850, 450))
@@ -346,3 +329,4 @@ lines!(ax_Nu, t, Nu; linewidth = 3)
 
 current_figure() #hide
 fig
+
diff --git a/examples/internal_tide.jl b/examples/internal_tide.jl
index 44bacda8d6..5b31f18418 100644
--- a/examples/internal_tide.jl
+++ b/examples/internal_tide.jl
@@ -14,6 +14,7 @@
 
 using Oceananigans
 using Oceananigans.Units
+using Oceananigans.ImmersedBoundaries: PartialCellBottom
 
 # ## Grid
 
@@ -46,13 +47,13 @@ width = 20kilometers
 hill(x) = h₀ * exp(-x^2 / 2width^2)
 bottom(x) = - H + hill(x)
 
-grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bottom))
+grid = ImmersedBoundaryGrid(underlying_grid, PartialCellBottom(bottom))
 
 # Let's see how the domain with the bathymetry is.
 
 x = xnodes(grid, Center())
 bottom_boundary = interior(grid.immersed_boundary.bottom_height, :, 1, 1)
-top_boundary = 0*x
+top_boundary = 0 * x
 
 using CairoMakie
 
@@ -132,7 +133,7 @@ bᵢ(x, z) = Nᵢ² * z
 
 set!(model, u=uᵢ, b=bᵢ)
 
-# Now let's built a `Simulation`.
+# Now let's build a `Simulation`.
 
 Δt = 5minutes
 stop_time = 4days
@@ -205,19 +206,18 @@ wmax = maximum(abs, w_t[end])
 times = u′_t.times
 nothing #hide
 
-# For visualization purposes, we mask the region below the bathymetry with NaNs.
-
-using Oceananigans.ImmersedBoundaries: mask_immersed_field!
-
-for φ_t in (u′_t, w_t, N²_t), n in 1:length(times)
-    mask_immersed_field!(φ_t[n], NaN)
-end
-
 # We retrieve each field's coordinates and convert from meters to kilometers.
 
-xu,  yu,  zu  = nodes(u′_t[1]) ./ 1e3
-xw,  yw,  zw  = nodes(w_t[1])  ./ 1e3
-xN², yN², zN² = nodes(N²_t[1]) ./ 1e3
+xu,  _, zu  = nodes(u′_t[1])
+xw,  _, zw  = nodes(w_t[1])
+xN², _, zN² = nodes(N²_t[1])
+
+xu  = xu  ./ 1e3
+xw  = xw  ./ 1e3
+xN² = xN² ./ 1e3
+zu  = zu  ./ 1e3
+zw  = zw  ./ 1e3
+zN² = zN² ./ 1e3
 nothing #hide
 
 # ## Visualize
@@ -235,9 +235,9 @@ n = Observable(1)
 title = @lift @sprintf("t = %1.2f days = %1.2f T₂",
                        round(times[$n] / day, digits=2) , round(times[$n] / T₂, digits=2))
 
-u′ₙ = @lift interior(u′_t[$n], :, 1, :)
- wₙ = @lift interior( w_t[$n], :, 1, :)
-N²ₙ = @lift interior(N²_t[$n], :, 1, :)
+u′n = @lift u′_t[$n]
+ wn = @lift  w_t[$n]
+N²n = @lift N²_t[$n]
 
 axis_kwargs = (xlabel = "x [km]",
                ylabel = "z [km]",
@@ -249,15 +249,15 @@ fig = Figure(size = (700, 900))
 fig[1, :] = Label(fig, title, fontsize=24, tellwidth=false)
 
 ax_u = Axis(fig[2, 1]; title = "u'-velocity", axis_kwargs...)
-hm_u = heatmap!(ax_u, xu, zu, u′ₙ; colorrange = (-umax, umax), colormap = :balance)
+hm_u = heatmap!(ax_u, xu, zu, u′n; nan_color=:gray, colorrange=(-umax, umax), colormap=:balance)
 Colorbar(fig[2, 2], hm_u, label = "m s⁻¹")
 
 ax_w = Axis(fig[3, 1]; title = "w-velocity", axis_kwargs...)
-hm_w = heatmap!(ax_w, xw, zw, wₙ; colorrange = (-wmax, wmax), colormap = :balance)
+hm_w = heatmap!(ax_w, xw, zw, wn; nan_color=:gray, colorrange=(-wmax, wmax), colormap=:balance)
 Colorbar(fig[3, 2], hm_w, label = "m s⁻¹")
 
 ax_N² = Axis(fig[4, 1]; title = "stratification N²", axis_kwargs...)
-hm_N² = heatmap!(ax_N², xN², zN², N²ₙ; colorrange = (0.9Nᵢ², 1.1Nᵢ²), colormap = :thermal)
+hm_N² = heatmap!(ax_N², xN², zN², N²n; nan_color=:gray, colorrange=(0.9Nᵢ², 1.1Nᵢ²), colormap=:magma)
 Colorbar(fig[4, 2], hm_N², label = "s⁻²")
 
 fig
diff --git a/examples/internal_wave.jl b/examples/internal_wave.jl
index 748a52d8ef..9fb2ed12ae 100644
--- a/examples/internal_wave.jl
+++ b/examples/internal_wave.jl
@@ -153,15 +153,12 @@ nothing #hide
 n = Observable(1)
 
 w_timeseries = FieldTimeSeries(filename, "w")
-x, y, z = nodes(w_timeseries)
-
-w = @lift interior(w_timeseries[$n], :, 1, :)
+w = @lift w_timeseries[$n]
 w_lim = 1e-8
 
-contourf!(ax, x, z, w;
+contourf!(ax, w;
           levels = range(-w_lim, stop=w_lim, length=10),
           colormap = :balance,
-          colorrange = (-w_lim, w_lim),
           extendlow = :auto,
           extendhigh = :auto)
 
diff --git a/examples/one_dimensional_diffusion.jl b/examples/one_dimensional_diffusion.jl
index ea50b9d2b4..492a8f4504 100644
--- a/examples/one_dimensional_diffusion.jl
+++ b/examples/one_dimensional_diffusion.jl
@@ -73,11 +73,7 @@ set_theme!(Theme(fontsize = 24, linewidth=3))
 fig = Figure()
 axis = (xlabel = "Temperature (ᵒC)", ylabel = "z")
 label = "t = 0"
-
-z = znodes(model.tracers.T)
-T = interior(model.tracers.T, 1, 1, :)
-
-lines(T, z; label, axis)
+lines(model.tracers.T; label, axis)
 current_figure() #hide
 
 # The function `interior` above extracts a `view` of `model.tracers.T` over the
@@ -105,7 +101,7 @@ run!(simulation)
 using Printf
 
 label = @sprintf("t = %.3f", model.clock.time)
-lines!(interior(model.tracers.T, 1, 1, :), z; label)
+lines!(model.tracers.T; label)
 axislegend()
 current_figure() #hide
 
@@ -136,8 +132,8 @@ xlims!(ax, 0, 1)
 
 n = Observable(1)
 
-T = @lift interior(T_timeseries[$n], 1, 1, :)
-lines!(T, z)
+T = @lift T_timeseries[$n]
+lines!(T)
 
 label = @lift "t = " * string(round(times[$n], digits=3))
 Label(fig[1, 1], label, tellwidth=false)
diff --git a/examples/tilted_bottom_boundary_layer.jl b/examples/tilted_bottom_boundary_layer.jl
index 8211a8a6c2..0e1efdb500 100644
--- a/examples/tilted_bottom_boundary_layer.jl
+++ b/examples/tilted_bottom_boundary_layer.jl
@@ -106,17 +106,15 @@ B∞_field = BackgroundField(constant_stratification, parameters=(; ĝ, N² = N
 # This shows that to impose a no-flux boundary condition on the total buoyancy field ``B``, we must apply a boundary condition to the perturbation buoyancy ``b``,
 # ```math
 # ∂_z b = - N^{2} \cos{\theta}.
-#```
+# ```
 
 ∂z_b_bottom = - N² * cosd(θ)
 negative_background_diffusive_flux = GradientBoundaryCondition(∂z_b_bottom)
 b_bcs = FieldBoundaryConditions(bottom = negative_background_diffusive_flux)
 
-# ## Bottom drag
+# ## Bottom drag and along-slope interior velocity
 #
-# We impose bottom drag that follows Monin--Obukhov theory.
-# We include the background flow in the drag calculation,
-# which is the only effect the background flow enters the problem,
+# We impose bottom drag that follows Monin--Obukhov theory:
 
 V∞ = 0.1 # m s⁻¹
 z₀ = 0.1 # m (roughness length)
@@ -134,6 +132,13 @@ drag_bc_v = FluxBoundaryCondition(drag_v, field_dependencies=(:u, :v), parameter
 u_bcs = FieldBoundaryConditions(bottom = drag_bc_u)
 v_bcs = FieldBoundaryConditions(bottom = drag_bc_v)
 
+# Note that, similar to the buoyancy boundary conditions, we had to
+# include the background flow in the drag calculation.
+#
+# Let us also create a `BackgroundField` for the along-slope interior velocity:
+
+V∞_field = BackgroundField(V∞)
+
 # ## Create the `NonhydrostaticModel`
 #
 # We are now ready to create the model. We create a `NonhydrostaticModel` with an
@@ -150,7 +155,7 @@ model = NonhydrostaticModel(; grid, buoyancy, coriolis, closure,
                             advection = UpwindBiasedFifthOrder(),
                             tracers = :b,
                             boundary_conditions = (u=u_bcs, v=v_bcs, b=b_bcs),
-                            background_fields = (; b=B∞_field))
+                            background_fields = (; b=B∞_field, v=V∞_field))
 
 # Let's introduce a bit of random noise at the bottom of the domain to speed up the onset of
 # turbulence:
diff --git a/examples/two_dimensional_turbulence.jl b/examples/two_dimensional_turbulence.jl
index 73c3859a74..e6807b963d 100644
--- a/examples/two_dimensional_turbulence.jl
+++ b/examples/two_dimensional_turbulence.jl
@@ -156,11 +156,11 @@ n = Observable(1)
 
 # Now let's plot the vorticity and speed.
 
-ω = @lift interior(ω_timeseries[$n], :, :, 1)
-s = @lift interior(s_timeseries[$n], :, :, 1)
+ω = @lift ω_timeseries[$n]
+s = @lift s_timeseries[$n]
 
-heatmap!(ax_ω, xω, yω, ω; colormap = :balance, colorrange = (-2, 2))
-heatmap!(ax_s, xs, ys, s; colormap = :speed, colorrange = (0, 0.2))
+heatmap!(ax_ω, ω; colormap = :balance, colorrange = (-2, 2))
+heatmap!(ax_s, s; colormap = :speed, colorrange = (0, 0.2))
 
 title = @lift "t = " * string(round(times[$n], digits=2))
 Label(fig[1, 1:2], title, fontsize=24, tellwidth=false)
diff --git a/ext/OceananigansEnzymeExt.jl b/ext/OceananigansEnzymeExt.jl
index c3b4954de8..e1b7d74a1f 100644
--- a/ext/OceananigansEnzymeExt.jl
+++ b/ext/OceananigansEnzymeExt.jl
@@ -13,59 +13,73 @@ isdefined(Base, :get_extension) ? (import Enzyme) : (import ..Enzyme)
 using Enzyme: EnzymeCore
 using Enzyme.EnzymeCore: Active, Const, Duplicated
 
+EnzymeCore.EnzymeRules.inactive_noinl(::typeof(Base.:(==)), ::Oceananigans.AbstractGrid, ::Oceananigans.AbstractGrid) = nothing
+EnzymeCore.EnzymeRules.inactive_noinl(::typeof(Oceananigans.AbstractOperations.validate_grid), x...) = nothing
+EnzymeCore.EnzymeRules.inactive_noinl(::typeof(Oceananigans.AbstractOperations.metric_function), x...) = nothing
 EnzymeCore.EnzymeRules.inactive_noinl(::typeof(Oceananigans.Utils.flatten_reduced_dimensions), x...) = nothing
 EnzymeCore.EnzymeRules.inactive(::typeof(Oceananigans.Grids.total_size), x...) = nothing
+EnzymeCore.EnzymeRules.inactive(::typeof(Oceananigans.BoundaryConditions.parent_size_and_offset), x...) = nothing
 @inline EnzymeCore.EnzymeRules.inactive_type(v::Type{Oceananigans.Utils.KernelParameters}) = true
 
 @inline batch(::Val{1}, ::Type{T}) where T = T
 @inline batch(::Val{N}, ::Type{T}) where {T, N} = NTuple{N, T}
 
-function EnzymeCore.EnzymeRules.augmented_primal(config,
-                                                 func::EnzymeCore.Const{Type{Field}},
-                                                 ::Type{<:EnzymeCore.Annotation{RT}},
-                                                 loc::Union{EnzymeCore.Const{<:Tuple},
-                                                 EnzymeCore.Duplicated{<:Tuple}},
-                                                 grid::EnzymeCore.Const{<:Oceananigans.Grids.AbstractGrid},
-                                                 T::EnzymeCore.Const{<:DataType}; kw...) where RT
-
-    primal = if EnzymeCore.EnzymeRules.needs_primal(config)
-        func.val(loc.val, grid.val, T.val; kw...)
-    else
-        nothing
-    end
-
-    if haskey(kw, :a)
-        # copy zeroing
-        kw[:data] = copy(kw[:data])
-    end
-
-    shadow = if EnzymeCore.EnzymeRules.width(config) == 1
-        func.val(loc.val, grid.val, T.val; kw...)
-    else
-        ntuple(Val(EnzymeCore.EnzymeRules.width(config))) do i
-            Base.@_inline_meta
-            func.val(loc.val, grid.val, T.val; kw...)
-        end
-    end
-
-    P = EnzymeCore.EnzymeRules.needs_primal(config) ? RT : Nothing
-    B = batch(Val(EnzymeCore.EnzymeRules.width(config)), RT)
-    return EnzymeCore.EnzymeRules.AugmentedReturn{P, B, Nothing}(primal, shadow, nothing)
-end
-
-#####
-##### Field
-#####
-
-function EnzymeCore.EnzymeRules.reverse(config::EnzymeCore.EnzymeRules.ConfigWidth{1},
-                                        func::EnzymeCore.Const{Type{Field}},
-                                        ::RT,
-                                        tape,
-                                        loc::Union{EnzymeCore.Const{<:Tuple}, EnzymeCore.Duplicated{<:Tuple}},
-                                        grid::EnzymeCore.Const{<:Oceananigans.Grids.AbstractGrid},
-                                        T::EnzymeCore.Const{<:DataType}; kw...) where RT
-    return (nothing, nothing, nothing)
-end
+# function EnzymeCore.EnzymeRules.augmented_primal(config,
+#                                                  func::EnzymeCore.Const{Type{Field}},
+#                                                  ::Type{<:EnzymeCore.Annotation{RT}},
+#                                                  loc::Union{EnzymeCore.Const{<:Tuple},
+#                                                  EnzymeCore.Duplicated{<:Tuple}},
+#                                                  grid::EnzymeCore.Annotation{<:Oceananigans.Grids.AbstractGrid},
+#                                                  T::EnzymeCore.Const{<:DataType}; kw...) where RT
+# 
+#     primal = if EnzymeCore.EnzymeRules.needs_primal(config)
+#         func.val(loc.val, grid.val, T.val; kw...)
+#     else
+#         nothing
+#     end
+# 
+#     if haskey(kw, :a)
+#         # copy zeroing
+#         kw[:data] = copy(kw[:data])
+#     end
+# 
+#     shadow = if EnzymeCore.EnzymeRules.width(config) == 1
+#         func.val(loc.val, grid.val, T.val; kw...)
+#     else
+#         ntuple(Val(EnzymeCore.EnzymeRules.width(config))) do i
+#             Base.@_inline_meta
+#             func.val(loc.val, grid.val, T.val; kw...)
+#         end
+#     end
+# 
+#     P = EnzymeCore.EnzymeRules.needs_primal(config) ? RT : Nothing
+#     B = batch(Val(EnzymeCore.EnzymeRules.width(config)), RT)
+#     return EnzymeCore.EnzymeRules.AugmentedReturn{P, B, Nothing}(primal, shadow, nothing)
+# end
+# 
+# #####
+# ##### Field
+# #####
+# 
+# function EnzymeCore.EnzymeRules.reverse(config::EnzymeCore.EnzymeRules.ConfigWidth{1},
+#                                         func::EnzymeCore.Const{Type{Field}},
+#                                         ::RT,
+#                                         tape,
+#                                         loc::Union{EnzymeCore.Const{<:Tuple}, EnzymeCore.Duplicated{<:Tuple}},
+#                                         grid::EnzymeCore.Const{<:Oceananigans.Grids.AbstractGrid},
+#                                         T::EnzymeCore.Const{<:DataType}; kw...) where RT
+#     return (nothing, nothing, nothing)
+# end
+# 
+# function EnzymeCore.EnzymeRules.reverse(config::EnzymeCore.EnzymeRules.ConfigWidth{1},
+#                                         func::EnzymeCore.Const{Type{Field}},
+#                                         ::RT,
+#                                         tape,
+#                                         loc::Union{EnzymeCore.Const{<:Tuple}, EnzymeCore.Duplicated{<:Tuple}},
+#                                         grid::EnzymeCore.Active{<:Oceananigans.Grids.AbstractGrid},
+#                                         T::EnzymeCore.Const{<:DataType}; kw...) where RT
+#     return (nothing, EnzymeCore.make_zero(grid), nothing)
+# end
 
 #####
 ##### FunctionField
@@ -73,255 +87,254 @@ end
 
 # @inline FunctionField(L::Tuple, func, grid) = FunctionField{L[1], L[2], L[3]}(func, grid)
 
-function EnzymeCore.EnzymeRules.augmented_primal(config,
-                                                 enzyme_func::Union{EnzymeCore.Const{<:Type{<:FunctionField}}, EnzymeCore.Const{Type{FT2}}},
-                                                 ::Type{<:EnzymeCore.Annotation{RT}},
-                                                 function_field_func,
-                                                 grid;
-                                                 clock = nothing,
-                                                 parameters = nothing) where {RT, FT2 <: FunctionField}
-
-    FunctionFieldType = enzyme_func.val     
-
-    primal = if EnzymeCore.EnzymeRules.needs_primal(config)
-        FunctionFieldType(function_field_func.val, grid.val; clock, parameters)
-    else
-        nothing
-    end
-
-    # function_field_func can be Active, Const (inactive), Duplicated (active but mutable)
-    function_field_is_active = function_field_func isa Active
-    # @show function_field_func
-
-    # Support batched differentiation!
-    config_width = EnzymeCore.EnzymeRules.width(config)
-
-    dactives = if function_field_is_active
-        if config_width == 1
-            Ref(EnzymeCore.make_zero(function_field_func.val))
-        else
-            ntuple(Val(config_width)) do i
-                Base.@_inline_meta
-                Ref(EnzymeCore.make_zero(function_field_func.val))
-            end
-        end
-    else
-        nothing
-    end
-
-    shadow = if config_width == 1
-        dfunction_field_func = if function_field_is_active
-            dactives[]
-        else
-            function_field_func.dval
-        end
-
-        FunctionFieldType(dfunction_field_func, grid.val; clock, parameters)
-    else
-  	    ntuple(Val(config_width)) do i
-  		    Base.@_inline_meta
-
-            dfunction_field_func = if function_field_is_active
-                dactives[i][]
-            else
-                function_field_func.dval[i]
-            end
-
-            FunctionFieldType(dfunction_field_func, grid.val; clock, parameters)
-  	    end
-    end
-
-    P = EnzymeCore.EnzymeRules.needs_primal(config) ? RT : Nothing
-    B = batch(Val(EnzymeCore.EnzymeRules.width(config)), RT)
-    D = typeof(dactives)
-
-    return EnzymeCore.EnzymeRules.AugmentedReturn{P, B, D}(primal, shadow, dactives)
-end
-
-function EnzymeCore.EnzymeRules.reverse(config,
-                                        enzyme_func::Union{EnzymeCore.Const{<:Type{<:FunctionField}}, EnzymeCore.Const{Type{FT2}}},
-                                        ::RT,
-                                        tape,
-                                        function_field_func,
-                                        grid;
-                                        clock = nothing,
-                                        parameters = nothing) where {RT, FT2 <: FunctionField}
-
-    dactives = if function_field_func isa Active
-        if EnzymeCore.EnzymeRules.width(config) == 1
-            tape[]
-        else
-            ntuple(Val(EnzymeCore.EnzymeRules.width(config))) do i
-                Base.@_inline_meta
-                tape[i][]
-            end
-        end
-    else
-        nothing
-    end
-
-    # return (dactives, grid (nothing))
-    return (dactives, nothing)
-end
+# function EnzymeCore.EnzymeRules.augmented_primal(config,
+#                                                  enzyme_func::Union{EnzymeCore.Const{<:Type{<:FunctionField}}, EnzymeCore.Const{Type{FT2}}},
+#                                                  ::Type{<:EnzymeCore.Annotation{RT}},
+#                                                  function_field_func,
+#                                                  grid;
+#                                                  clock = nothing,
+#                                                  parameters = nothing) where {RT, FT2 <: FunctionField}
+# 
+#     FunctionFieldType = enzyme_func.val     
+# 
+#     primal = if EnzymeCore.EnzymeRules.needs_primal(config)
+#         FunctionFieldType(function_field_func.val, grid.val; clock, parameters)
+#     else
+#         nothing
+#     end
+# 
+#     # function_field_func can be Active, Const (inactive), Duplicated (active but mutable)
+#     function_field_is_active = function_field_func isa Active
+#     # @show function_field_func
+# 
+#     # Support batched differentiation!
+#     config_width = EnzymeCore.EnzymeRules.width(config)
+# 
+#     dactives = if function_field_is_active
+#         if config_width == 1
+#             Ref(EnzymeCore.make_zero(function_field_func.val))
+#         else
+#             ntuple(Val(config_width)) do i
+#                 Base.@_inline_meta
+#                 Ref(EnzymeCore.make_zero(function_field_func.val))
+#             end
+#         end
+#     else
+#         nothing
+#     end
+# 
+#     shadow = if config_width == 1
+#         dfunction_field_func = if function_field_is_active
+#             dactives[]
+#         else
+#             function_field_func.dval
+#         end
+# 
+#         FunctionFieldType(dfunction_field_func, grid.val; clock, parameters)
+#     else
+#   	    ntuple(Val(config_width)) do i
+#   		    Base.@_inline_meta
+# 
+#             dfunction_field_func = if function_field_is_active
+#                 dactives[i][]
+#             else
+#                 function_field_func.dval[i]
+#             end
+# 
+#             FunctionFieldType(dfunction_field_func, grid.val; clock, parameters)
+#   	    end
+#     end
+# 
+#     P = EnzymeCore.EnzymeRules.needs_primal(config) ? RT : Nothing
+#     B = batch(Val(EnzymeCore.EnzymeRules.width(config)), RT)
+#     D = typeof(dactives)
+# 
+#     return EnzymeCore.EnzymeRules.AugmentedReturn{P, B, D}(primal, shadow, dactives)
+# end
+# 
+# function EnzymeCore.EnzymeRules.reverse(config,
+#                                         enzyme_func::Union{EnzymeCore.Const{<:Type{<:FunctionField}}, EnzymeCore.Const{Type{FT2}}},
+#                                         ::RT,
+#                                         tape,
+#                                         function_field_func,
+#                                         grid;
+#                                         clock = nothing,
+#                                         parameters = nothing) where {RT, FT2 <: FunctionField}
+# 
+#     dactives = if function_field_func isa Active
+#         if EnzymeCore.EnzymeRules.width(config) == 1
+#             tape[]
+#         else
+#             ntuple(Val(EnzymeCore.EnzymeRules.width(config))) do i
+#                 Base.@_inline_meta
+#                 tape[i][]
+#             end
+#         end
+#     else
+#         nothing
+#     end
+# 
+#     # return (dactives, grid (nothing))
+#     return (dactives, nothing)
+# end
 
 #####
 ##### launch!
 #####
 
-function EnzymeCore.EnzymeRules.augmented_primal(config,
-                                                 func::EnzymeCore.Const{typeof(Oceananigans.Models.flattened_unique_values)},
-                                                 ::Type{<:EnzymeCore.Annotation{RT}},
-                                                 a) where RT
-
-    sprimal = if EnzymeCore.EnzymeRules.needs_primal(config) || EnzymeCore.EnzymeRules.needs_shadow(config)
-        func.val(a.val)
-    else
-        nothing
-    end
-
-    shadow = if EnzymeCore.EnzymeRules.needs_shadow(config)
-        if EnzymeCore.EnzymeRules.width(config) == 1
-            (typeof(a) <: Const) ? EnzymeCore.make_zero(sprimal)::RT : func.val(a.dval)
-        else
-            ntuple(Val(EnzymeCore.EnzymeRules.width(config))) do i
-                Base.@_inline_meta
-                (typeof(a) <: Const) ? EnzymeCore.make_zero(sprimal)::RT : func.val(a.dval[i])
-            end
-        end
-    else
-        nothing
-    end
-
-    primal = if EnzymeCore.EnzymeRules.needs_primal(config)
-        sprimal
-    else
-        nothing
-    end
-
-    P = EnzymeCore.EnzymeRules.needs_primal(config) ? RT : Nothing
-    B = EnzymeCore.EnzymeRules.needs_primal(config) ? batch(Val(EnzymeCore.EnzymeRules.width(config)), RT) : Nothing
-
-    return EnzymeCore.EnzymeRules.AugmentedReturn{P, B, Nothing}(primal, shadow, nothing)
-end
-
-function EnzymeCore.EnzymeRules.reverse(config,
-                                        func::EnzymeCore.Const{typeof(Oceananigans.Models.flattened_unique_values)},
-                                         ::Type{<:EnzymeCore.Annotation{RT}},
-                                         tape,
-                                         a) where RT
-
-  return (nothing,)
-end
+# function EnzymeCore.EnzymeRules.augmented_primal(config,
+#                                                  func::EnzymeCore.Const{typeof(Oceananigans.Models.flattened_unique_values)},
+#                                                  ::Type{<:EnzymeCore.Annotation{RT}},
+#                                                  a) where RT
+# 
+#     sprimal = if EnzymeCore.EnzymeRules.needs_primal(config) || EnzymeCore.EnzymeRules.needs_shadow(config)
+#         func.val(a.val)
+#     else
+#         nothing
+#     end
+# 
+#     shadow = if EnzymeCore.EnzymeRules.needs_shadow(config)
+#         if EnzymeCore.EnzymeRules.width(config) == 1
+#             (typeof(a) <: Const) ? EnzymeCore.make_zero(sprimal)::RT : func.val(a.dval)
+#         else
+#             ntuple(Val(EnzymeCore.EnzymeRules.width(config))) do i
+#                 Base.@_inline_meta
+#                 (typeof(a) <: Const) ? EnzymeCore.make_zero(sprimal)::RT : func.val(a.dval[i])
+#             end
+#         end
+#     else
+#         nothing
+#     end
+# 
+#     primal = if EnzymeCore.EnzymeRules.needs_primal(config)
+#         sprimal
+#     else
+#         nothing
+#     end
+# 
+#     P = EnzymeCore.EnzymeRules.needs_primal(config) ? RT : Nothing
+#     B = EnzymeCore.EnzymeRules.needs_primal(config) ? batch(Val(EnzymeCore.EnzymeRules.width(config)), RT) : Nothing
+# 
+#     return EnzymeCore.EnzymeRules.AugmentedReturn{P, B, Nothing}(primal, shadow, nothing)
+# end
+# 
+# function EnzymeCore.EnzymeRules.reverse(config,
+#                                         func::EnzymeCore.Const{typeof(Oceananigans.Models.flattened_unique_values)},
+#                                          ::Type{<:EnzymeCore.Annotation{RT}},
+#                                          tape,
+#                                          a) where RT
+# 
+#   return (nothing,)
+# end
 
 #####
 ##### launch!
 #####
 
-function EnzymeCore.EnzymeRules.augmented_primal(config,
-                                                 func::EnzymeCore.Const{typeof(Oceananigans.Utils.launch!)},
-                                                 ::Type{EnzymeCore.Const{Nothing}},
-                                                 arch,
-                                                 grid,
-                                                 workspec,
-                                                 kernel!,
-                                                 kernel_args::Vararg{Any,N};
-                                                 include_right_boundaries = false,
-                                                 reduced_dimensions = (),
-                                                 location = nothing,
-                                                 active_cells_map = nothing,
-                                                 kwargs...) where N
-
-
-    workgroup, worksize = Oceananigans.Utils.work_layout(grid.val, workspec.val;
-                                                         include_right_boundaries,
-                                                         reduced_dimensions,
-                                                         location)
-
-    offset = Oceananigans.Utils.offsets(workspec.val)
-
-    if !isnothing(active_cells_map) 
-        workgroup, worksize = Oceananigans.Utils.active_cells_work_layout(workgroup, worksize, active_cells_map, grid.val) 
-        offset = nothing
-    end
-
-    if worksize != 0
-      
-      # We can only launch offset kernels with Static sizes!!!!
-
-      if isnothing(offset)
-          loop! = kernel!.val(Oceananigans.Architectures.device(arch.val), workgroup, worksize)
-          dloop! = (typeof(kernel!) <: EnzymeCore.Const) ? nothing : kernel!.dval(Oceananigans.Architectures.device(arch.val), workgroup, worksize)
-      else
-          loop! = kernel!.val(Oceananigans.Architectures.device(arch.val), KernelAbstractions.StaticSize(workgroup), Oceananigans.Utils.OffsetStaticSize(contiguousrange(worksize, offset))) 
-          dloop! = (typeof(kernel!) <: EnzymeCore.Const) ? nothing : kernel!.val(Oceananigans.Architectures.device(arch.val), KernelAbstractions.StaticSize(workgroup), Oceananigans.Utils.OffsetStaticSize(contiguousrange(worksize, offset)))
-      end
-
-      @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset from $workspec.val"
-
-
-      duploop = (typeof(kernel!) <: EnzymeCore.Const) ? EnzymeCore.Const(loop!) : EnzymeCore.Duplicated(loop!, dloop!)
-
-      config2 = EnzymeCore.EnzymeRules.Config{#=needsprimal=#false, #=needsshadow=#false, #=width=#EnzymeCore.EnzymeRules.width(config), EnzymeCore.EnzymeRules.overwritten(config)[5:end]}()
-      subtape = EnzymeCore.EnzymeRules.augmented_primal(config2, duploop, EnzymeCore.Const{Nothing}, kernel_args...).tape
-
-      tape = (duploop, subtape)
-    else
-      tape = nothing
-    end
-
-    return EnzymeCore.EnzymeRules.AugmentedReturn{Nothing, Nothing, Any}(nothing, nothing, tape)
-end
-
-@inline arg_elem_type(::Type{T}, ::Val{i}) where {T<:Tuple, i} = eltype(T.parameters[i])
-
-function EnzymeCore.EnzymeRules.reverse(config::EnzymeCore.EnzymeRules.ConfigWidth{1},
-                                                func::EnzymeCore.Const{typeof(Oceananigans.Utils.launch!)},
-                                                 ::Type{EnzymeCore.Const{Nothing}},
-                                                 tape,
-                                                 arch,
-                                                 grid,
-                                                 workspec,
-                                                 kernel!,
-                                                 kernel_args::Vararg{Any,N};
-                                                 include_right_boundaries = false,
-                                                 reduced_dimensions = (),
-                                                 location = nothing,
-                                                 active_cells_map = nothing,
-                                                 kwargs...) where N
-
-  subrets = if tape !== nothing
-    duploop, subtape = tape
-    config2 = EnzymeCore.EnzymeRules.Config{#=needsprimal=#false, #=needsshadow=#false, #=width=#EnzymeCore.EnzymeRules.width(config), EnzymeCore.EnzymeRules.overwritten(config)[5:end]}()
-    EnzymeCore.EnzymeRules.reverse(config2, duploop, EnzymeCore.Const{Nothing}, subtape, kernel_args...)
-  else
-    res2 = ntuple(Val(N)) do i
-      Base.@_inline_meta
-      if kernel_args[i] isa Active
-        EnzymeCore.make_zero(kernel_args[i].val)
-      else
-        nothing
-      end
-    end
-  end
-
-  subrets2 =  ntuple(Val(N)) do i
-      Base.@_inline_meta
-      if kernel_args[i] isa Active
-        subrets[i]::arg_elem_type(typeof(kernel_args), Val(i))
-      else
-        nothing
-      end
-    end
-
-  return (nothing, nothing, nothing, nothing, subrets2...)
-
-end
+# function EnzymeCore.EnzymeRules.augmented_primal(config,
+#                                                  func::EnzymeCore.Const{typeof(Oceananigans.Utils.launch!)},
+#                                                  ::Type{EnzymeCore.Const{Nothing}},
+#                                                  arch,
+#                                                  grid,
+#                                                  workspec,
+#                                                  kernel!,
+#                                                  kernel_args::Vararg{Any,N};
+#                                                  include_right_boundaries = false,
+#                                                  reduced_dimensions = (),
+#                                                  location = nothing,
+#                                                  active_cells_map = nothing,
+#                                                  kwargs...) where N
+# 
+# 
+#     workgroup, worksize = Oceananigans.Utils.work_layout(grid.val, workspec.val;
+#                                                          include_right_boundaries,
+#                                                          reduced_dimensions,
+#                                                          location)
+# 
+#     offset = Oceananigans.Utils.offsets(workspec.val)
+# 
+#     if !isnothing(active_cells_map) 
+#         workgroup, worksize = Oceananigans.Utils.active_cells_work_layout(workgroup, worksize, active_cells_map, grid.val) 
+#         offset = nothing
+#     end
+# 
+#     if worksize != 0
+#       
+#       # We can only launch offset kernels with Static sizes!!!!
+# 
+#       if isnothing(offset)
+#           loop! = kernel!.val(Oceananigans.Architectures.device(arch.val), workgroup, worksize)
+#           dloop! = (typeof(kernel!) <: EnzymeCore.Const) ? nothing : kernel!.dval(Oceananigans.Architectures.device(arch.val), workgroup, worksize)
+#       else
+#           loop! = kernel!.val(Oceananigans.Architectures.device(arch.val), KernelAbstractions.StaticSize(workgroup), Oceananigans.Utils.OffsetStaticSize(contiguousrange(worksize, offset))) 
+#           dloop! = (typeof(kernel!) <: EnzymeCore.Const) ? nothing : kernel!.val(Oceananigans.Architectures.device(arch.val), KernelAbstractions.StaticSize(workgroup), Oceananigans.Utils.OffsetStaticSize(contiguousrange(worksize, offset)))
+#       end
+# 
+#       @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset from $workspec.val"
+# 
+# 
+#       duploop = (typeof(kernel!) <: EnzymeCore.Const) ? EnzymeCore.Const(loop!) : EnzymeCore.Duplicated(loop!, dloop!)
+# 
+#       config2 = EnzymeCore.EnzymeRules.Config{#=needsprimal=#false, #=needsshadow=#false, #=width=#EnzymeCore.EnzymeRules.width(config), EnzymeCore.EnzymeRules.overwritten(config)[5:end]}()
+#       subtape = EnzymeCore.EnzymeRules.augmented_primal(config2, duploop, EnzymeCore.Const{Nothing}, kernel_args...).tape
+# 
+#       tape = (duploop, subtape)
+#     else
+#       tape = nothing
+#     end
+# 
+#     return EnzymeCore.EnzymeRules.AugmentedReturn{Nothing, Nothing, Any}(nothing, nothing, tape)
+# end
+# 
+# @inline arg_elem_type(::Type{T}, ::Val{i}) where {T<:Tuple, i} = eltype(T.parameters[i])
+# 
+# function EnzymeCore.EnzymeRules.reverse(config::EnzymeCore.EnzymeRules.ConfigWidth{1},
+#                                                 func::EnzymeCore.Const{typeof(Oceananigans.Utils.launch!)},
+#                                                  ::Type{EnzymeCore.Const{Nothing}},
+#                                                  tape,
+#                                                  arch,
+#                                                  grid,
+#                                                  workspec,
+#                                                  kernel!,
+#                                                  kernel_args::Vararg{Any,N};
+#                                                  include_right_boundaries = false,
+#                                                  reduced_dimensions = (),
+#                                                  location = nothing,
+#                                                  active_cells_map = nothing,
+#                                                  kwargs...) where N
+# 
+#   subrets = if tape !== nothing
+#     duploop, subtape = tape
+#     config2 = EnzymeCore.EnzymeRules.Config{#=needsprimal=#false, #=needsshadow=#false, #=width=#EnzymeCore.EnzymeRules.width(config), EnzymeCore.EnzymeRules.overwritten(config)[5:end]}()
+#     EnzymeCore.EnzymeRules.reverse(config2, duploop, EnzymeCore.Const{Nothing}, subtape, kernel_args...)
+#   else
+#     res2 = ntuple(Val(N)) do i
+#       Base.@_inline_meta
+#       if kernel_args[i] isa Active
+#         EnzymeCore.make_zero(kernel_args[i].val)
+#       else
+#         nothing
+#       end
+#     end
+#   end
+# 
+#   subrets2 =  ntuple(Val(N)) do i
+#       Base.@_inline_meta
+#       if kernel_args[i] isa Active
+#         subrets[i]::arg_elem_type(typeof(kernel_args), Val(i))
+#       else
+#         nothing
+#       end
+#     end
+# 
+#   return (nothing, nothing, nothing, nothing, subrets2...)
+# 
+# end
 
 #####
 ##### update_model_field_time_series!
 #####
 
-#=
 function EnzymeCore.EnzymeRules.augmented_primal(config,
                                                  func::EnzymeCore.Const{typeof(Oceananigans.Models.update_model_field_time_series!)},
                                                  ::Type{EnzymeCore.Const{Nothing}},
@@ -332,12 +345,12 @@ function EnzymeCore.EnzymeRules.augmented_primal(config,
 
     possible_fts = Oceananigans.Models.possible_field_time_series(model.val)
 
-    time_series_tuple = Oceananigans.Models.extract_field_timeseries(possible_fts)
+    time_series_tuple = Oceananigans.OutputReaders.extract_field_time_series(possible_fts)
     time_series_tuple = Oceananigans.Models.flattened_unique_values(time_series_tuple)
 
     fulltape = if EnzymeCore.EnzymeRules.width(config) == 1
         dpossible_fts = Oceananigans.Models.possible_field_time_series(model.dval)
-        dtime_series_tuple = Oceananigans.Models.extract_field_timeseries(possible_fts)
+        dtime_series_tuple = Oceananigans.OutputReaders.extract_field_time_series(dpossible_fts) # shadow series must come from model.dval, not the primal
         dtime_series_tuple = Oceananigans.Models.flattened_unique_values(dtime_series_tuple)
 
         tapes = []
@@ -352,7 +365,7 @@ function EnzymeCore.EnzymeRules.augmented_primal(config,
         ntuple(Val(EnzymeCore.EnzymeRules.width(config))) do i
             Base.@_inline_meta
             dpossible_fts = Oceananigans.Models.possible_field_time_series(model.dval[i])
-            dtime_series_tuple = Oceananigans.Models.extract_field_timeseries(possible_fts)
+            dtime_series_tuple = Oceananigans.OutputReaders.extract_field_time_series(dpossible_fts) # shadow series must come from model.dval[i], not the primal
             dtime_series_tuple = Oceananigans.Models.flattened_unique_values(dtime_series_tuple)
 
             tapes = []
@@ -377,10 +390,15 @@ function EnzymeCore.EnzymeRules.reverse(config,
                                         clock)
 
     time = (typeof(clock) <: EnzymeCore.Const) ? Const(Oceananigans.Utils.Time(clock.val.time)) : Duplicated(Oceananigans.Utils.Time(clock.val.time), Oceananigans.Utils.Time(clock.dval.time))
+    
+    possible_fts = Oceananigans.Models.possible_field_time_series(model.val)
+
+    time_series_tuple = Oceananigans.OutputReaders.extract_field_time_series(possible_fts)
+    time_series_tuple = Oceananigans.Models.flattened_unique_values(time_series_tuple)
 
     if EnzymeCore.EnzymeRules.width(config) == 1
         dpossible_fts = Oceananigans.Models.possible_field_time_series(model.dval)
-        dtime_series_tuple = Oceananigans.Models.extract_field_timeseries(possible_fts)
+        dtime_series_tuple = Oceananigans.OutputReaders.extract_field_time_series(dpossible_fts)
         dtime_series_tuple = Oceananigans.Models.flattened_unique_values(dtime_series_tuple)
 
         tapes = fulltape
@@ -389,7 +407,6 @@ function EnzymeCore.EnzymeRules.reverse(config,
             dupft = Enzyme.Compiler.guaranteed_const(typeof(fts)) ? Const(fts) : Duplicated(fts, dfts)
             fwdfn, revfn = Enzyme.autodiff_thunk(EnzymeCore.ReverseSplitNoPrimal, Const{typeof(Oceananigans.Models.update_field_time_series!)}, Const, typeof(dupft), typeof(time))
             revfn(Const(Oceananigans.Models.update_field_time_series!), dupft, time, tapes[i])
-            push!(tapes, tape)
             i+= 1
         end
     else
@@ -398,7 +415,7 @@ function EnzymeCore.EnzymeRules.reverse(config,
 
             tapes = fulltape[i]
             dpossible_fts = Oceananigans.Models.possible_field_time_series(model.dval[i])
-            dtime_series_tuple = Oceananigans.Models.extract_field_timeseries(possible_fts)
+            dtime_series_tuple = Oceananigans.OutputReaders.extract_field_time_series(dpossible_fts)
             dtime_series_tuple = Oceananigans.Models.flattened_unique_values(dtime_series_tuple)
 
             i += 1
@@ -415,8 +432,6 @@ function EnzymeCore.EnzymeRules.reverse(config,
 
 end
 
-=#
-
 #####
 ##### top_tracer_boundary_conditions
 #####
diff --git a/ext/OceananigansMakieExt.jl b/ext/OceananigansMakieExt.jl
new file mode 100644
index 0000000000..9c8fe12bad
--- /dev/null
+++ b/ext/OceananigansMakieExt.jl
@@ -0,0 +1,140 @@
+module OceananigansMakieExt
+
+using Oceananigans
+using Oceananigans.AbstractOperations: AbstractOperation
+using Oceananigans.Architectures: on_architecture
+using Oceananigans.ImmersedBoundaries: mask_immersed_field!
+
+using MakieCore: AbstractPlot
+import MakieCore: convert_arguments, _create_plot
+import Makie: args_preferred_axis
+
+# Extending args_preferred_axis here ensures that a Field
+# does not overstate a preference for being plotted in a 3D LScene,
+# because we are often trying to plot 1D and 2D Fields, even though
+# (perhaps incorrectly) every Field is an AbstractArray{3}.
+args_preferred_axis(::Field) = nothing
+
+function drop_singleton_indices(N)
+    if N == 1
+        return 1
+    else
+        return Colon()
+    end
+end
+
+function _create_plot(F::Function, attributes::Dict, f::Field)
+    converted_args = convert_field_argument(f)
+    return _create_plot(F, attributes, converted_args...)
+end
+
+function _create_plot(F::Function, attributes::Dict, op::AbstractOperation)
+    computed_field = Field(op) # wrap the operation in a Field so it can be computed
+    compute!(computed_field)
+    return _create_plot(F, attributes, computed_field) # forwards to the Field method
+end
+
+convert_arguments(pl::Type{<:AbstractPlot}, f::Field) =
+    convert_arguments(pl, convert_field_argument(f)...)
+
+function convert_arguments(pl::Type{<:AbstractPlot}, fop::AbstractOperation)
+    f = Field(fop) # was `Field(op)`: `op` is undefined in this method (the argument is `fop`)
+    compute!(f)
+    return convert_arguments(pl, f) # forwards to the Field method
+end
+
+function convert_arguments(pl::Type{<:AbstractPlot}, ξ1::AbstractArray, op::AbstractOperation)
+    f = Field(op)
+    compute!(f)
+    return convert_arguments(pl, ξ1, f)
+end
+
+function convert_arguments(pl::Type{<:AbstractPlot}, ξ1::AbstractArray, ξ2::AbstractArray, op::AbstractOperation)
+    f = Field(op)
+    compute!(f)
+    return convert_arguments(pl, ξ1, ξ2, f)
+end
+
+"""
+    make_plottable_array(f)
+
+Convert a field `f` to an array that can be plotted with Makie by
+
+- masking immersed cells (for fields on immersed boundary
+grids) with NaNs;
+- dropping singleton dimensions, and
+- transferring data from GPU to CPU if necessary.
+"""
+function make_plottable_array(f)
+    compute!(f)
+    mask_immersed_field!(f, NaN)
+
+    Nx, Ny, Nz = size(f)
+
+    ii = drop_singleton_indices(Nx)
+    jj = drop_singleton_indices(Ny)
+    kk = drop_singleton_indices(Nz)
+
+    fi = interior(f, ii, jj, kk)
+    fi_cpu = on_architecture(CPU(), fi)
+
+    return fi_cpu
+end
+
+function convert_field_argument(f::Field)
+
+    fi_cpu = make_plottable_array(f)
+
+    # Indices of the first and last non-singleton dimensions
+    d1 = findfirst(n -> n > 1, size(f))
+    d2 =  findlast(n -> n > 1, size(f))
+    
+    # Nodes shenanigans
+    fnodes = nodes(f)
+
+    # Deduce dimensionality
+    Nx, Ny, Nz = size(f)
+    D = (Nx > 1) + (Ny > 1) + (Nz > 1)
+
+    if D == 1
+
+        ξ1 = fnodes[d1]
+        ξ1_cpu = on_architecture(CPU(), ξ1)
+
+        # Shenanigans
+        if d1 === 1 # horizontal plot, in x
+            return ξ1_cpu, fi_cpu
+        else # vertical plot instead
+            return fi_cpu, ξ1_cpu
+        end
+
+    elseif D == 2
+
+        ξ1 = fnodes[d1]
+        ξ2 = fnodes[d2]
+
+        ξ1_cpu = on_architecture(CPU(), ξ1)
+        ξ2_cpu = on_architecture(CPU(), ξ2)
+
+        return ξ1_cpu, ξ2_cpu, fi_cpu
+
+    elseif D == 3
+        throw(ArgumentError("Cannot convert_arguments for a 3D field!"))
+    end
+end
+
+#####
+##### When nodes are provided
+#####
+
+function convert_arguments(pl::Type{<:AbstractPlot}, ξ1::AbstractArray, f::Field)
+    fi_cpu = make_plottable_array(f)
+    return convert_arguments(pl, ξ1, fi_cpu)
+end
+
+function convert_arguments(pl::Type{<:AbstractPlot}, ξ1::AbstractArray, ξ2::AbstractArray, f::Field)
+    fi_cpu = make_plottable_array(f)
+    return convert_arguments(pl, ξ1, ξ2, fi_cpu)
+end
+
+end # module
diff --git a/src/AbstractOperations/AbstractOperations.jl b/src/AbstractOperations/AbstractOperations.jl
index 390597cc6c..576f633179 100644
--- a/src/AbstractOperations/AbstractOperations.jl
+++ b/src/AbstractOperations/AbstractOperations.jl
@@ -34,15 +34,6 @@ abstract type AbstractOperation{LX, LY, LZ, G, T} <: AbstractField{LX, LY, LZ, G
 
 const AF = AbstractField # used in unary_operations.jl, binary_operations.jl, etc
 
-function Base.axes(f::AbstractOperation)
-    idx = indices(f)
-    if idx === (:, : ,:)
-        return Base.OneTo.(size(f))
-    else
-        return Tuple(idx[i] isa Colon ? Base.OneTo(size(f, i)) : idx[i] for i = 1:3)
-    end
-end
-
 # We have no halos to fill
 @inline fill_halo_regions!(::AbstractOperation, args...; kwargs...) = nothing
 
diff --git a/src/AbstractOperations/conditional_operations.jl b/src/AbstractOperations/conditional_operations.jl
index 3b8ccc2879..7eba3699f7 100644
--- a/src/AbstractOperations/conditional_operations.jl
+++ b/src/AbstractOperations/conditional_operations.jl
@@ -12,7 +12,8 @@ struct ConditionalOperation{LX, LY, LZ, O, F, G, C, M, T} <: AbstractOperation{L
     condition :: C
     mask :: M
 
-    function ConditionalOperation{LX, LY, LZ}(operand::O, func::F, grid::G, condition::C, mask::M) where {LX, LY, LZ, O, F, G, C, M}
+    function ConditionalOperation{LX, LY, LZ}(operand::O, func::F, grid::G,
+                                              condition::C, mask::M) where {LX, LY, LZ, O, F, G, C, M}
         T = eltype(operand)
         return new{LX, LY, LZ, O, F, G, C, M, T}(operand, func, grid, condition, mask)
     end
@@ -58,25 +59,28 @@ julia> using Oceananigans.Fields: condition_operand
 
 julia> c = CenterField(RectilinearGrid(size=(2, 1, 1), extent=(1, 1, 1)));
 
-julia> f(i, j, k, grid, c) = i < 2; d = condition_operand(cos, c, f, 10)
+julia> add_2(c) = c + 2
+add_2 (generic function with 1 method)
+
+julia> f(i, j, k, grid, c) = i < 2; d = condition_operand(add_2, c, f, 10.0)
 ConditionalOperation at (Center, Center, Center)
 ├── operand: 2×1×1 Field{Center, Center, Center} on RectilinearGrid on CPU
 ├── grid: 2×1×1 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 2×1×1 halo
-├── func: cos (generic function with 40 methods)
+├── func: add_2 (generic function with 1 method)
 ├── condition: f (generic function with 1 method)
-└── mask: 10
+└── mask: 10.0
 
 julia> d[1, 1, 1]
-1.0
+2.0
 
 julia> d[2, 1, 1]
-10
+10.0
 ```
 """
 function ConditionalOperation(operand::AbstractField;
                               func = identity,
                               condition = nothing,
-                              mask = 0)
+                              mask = zero(eltype(operand)))
 
     LX, LY, LZ = location(operand)
     return ConditionalOperation{LX, LY, LZ}(operand, func, operand.grid, condition, mask)
diff --git a/src/AbstractOperations/grid_metrics.jl b/src/AbstractOperations/grid_metrics.jl
index c7bee46871..f873242aa1 100644
--- a/src/AbstractOperations/grid_metrics.jl
+++ b/src/AbstractOperations/grid_metrics.jl
@@ -114,7 +114,7 @@ function metric_function(loc, metric::AbstractGridMetric)
     code = Tuple(interpolation_code(ℓ) for ℓ in loc)
     prefix = metric_function_prefix(metric)
     metric_function_symbol = Symbol(prefix, code...)
-    return eval(metric_function_symbol)
+    return getglobal(@__MODULE__, metric_function_symbol)
 end
 
 struct GridMetricOperation{LX, LY, LZ, G, T, M} <: AbstractOperation{LX, LY, LZ, G, T}
diff --git a/src/Advection/Advection.jl b/src/Advection/Advection.jl
index b8d7d96469..c54b310475 100644
--- a/src/Advection/Advection.jl
+++ b/src/Advection/Advection.jl
@@ -21,6 +21,7 @@ export
     UpwindBiased, UpwindBiasedFirstOrder, UpwindBiasedThirdOrder, UpwindBiasedFifthOrder,
     WENO, WENOThirdOrder, WENOFifthOrder,
     VectorInvariant, WENOVectorInvariant,
+    TracerAdvection,
     EnergyConserving,
     EnstrophyConserving
 
@@ -69,9 +70,8 @@ include("stretched_weno_smoothness.jl")
 include("multi_dimensional_reconstruction.jl")
 include("vector_invariant_upwinding.jl")
 include("vector_invariant_advection.jl")
-include("vector_invariant_cross_upwinding.jl")
 include("vector_invariant_self_upwinding.jl")
-include("vector_invariant_velocity_upwinding.jl")
+include("vector_invariant_cross_upwinding.jl")
 
 include("flat_advective_fluxes.jl")
 include("topologically_conditional_interpolation.jl")
diff --git a/src/Advection/cell_advection_timescale.jl b/src/Advection/cell_advection_timescale.jl
index 4c7e3897fc..170dd76932 100644
--- a/src/Advection/cell_advection_timescale.jl
+++ b/src/Advection/cell_advection_timescale.jl
@@ -16,12 +16,19 @@ function cell_advection_timescale(grid, velocities)
     return minimum(τ)
 end
 
-@inline function cell_advection_timescaleᶜᶜᶜ(i, j, k, grid, u, v, w)
+@inline _inverse_timescale(i, j, k, Δ, U, topo) = @inbounds abs(U[i, j, k]) / Δ
+@inline _inverse_timescale(i, j, k, Δ, U, topo::Flat) = 0
+
+@inline function cell_advection_timescaleᶜᶜᶜ(i, j, k, grid::AbstractGrid{FT, TX, TY, TZ}, u, v, w) where {FT, TX, TY, TZ}
     Δx = Δxᶠᶜᶜ(i, j, k, grid)
     Δy = Δyᶜᶠᶜ(i, j, k, grid)
     Δz = Δzᶜᶜᶠ(i, j, k, grid)
 
-    inverse_timescale = @inbounds abs(u[i, j, k]) / Δx + abs(v[i, j, k]) / Δy + abs(w[i, j, k]) / Δz
+    inverse_timescale_x = _inverse_timescale(i, j, k, Δx, u, TX())
+    inverse_timescale_y = _inverse_timescale(i, j, k, Δy, v, TY())
+    inverse_timescale_z = _inverse_timescale(i, j, k, Δz, w, TZ())
+    
+    inverse_timescale = inverse_timescale_x + inverse_timescale_y + inverse_timescale_z
      
     return 1 / inverse_timescale
 end
diff --git a/src/Advection/centered_reconstruction.jl b/src/Advection/centered_reconstruction.jl
index b6f5010f80..5884d37b5a 100644
--- a/src/Advection/centered_reconstruction.jl
+++ b/src/Advection/centered_reconstruction.jl
@@ -91,21 +91,14 @@ CenteredFourthOrder(grid=nothing, FT::DataType=Float64) = Centered(grid, FT; ord
 const ACAS = AbstractCenteredAdvectionScheme
 
 # left and right biased for Centered reconstruction are just symmetric!
-@inline left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::ACAS, c, args...) = symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, c, args...)
-@inline left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::ACAS, c, args...) = symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, c, args...)
-@inline left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::ACAS, c, args...) = symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, c, args...)
+@inline biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::ACAS, bias, c, args...) = symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, c, args...)
+@inline biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::ACAS, bias, c, args...) = symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, c, args...)
+@inline biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::ACAS, bias, c, args...) = symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, c, args...)
 
-@inline right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::ACAS, c, args...) = symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, c, args...)
-@inline right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::ACAS, c, args...) = symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, c, args...)
-@inline right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::ACAS, c, args...) = symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, c, args...)
-
-@inline left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme::ACAS, u, args...) = symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, u, args...)
-@inline left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme::ACAS, v, args...) = symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, v, args...)
-@inline left_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme::ACAS, w, args...) = symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, w, args...)
-
-@inline right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme::ACAS, u, args...) = symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, u, args...)
-@inline right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme::ACAS, v, args...) = symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, v, args...)
-@inline right_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme::ACAS, w, args...) = symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, w, args...)
+# left and right biased for Centered reconstruction are just symmetric!
+@inline biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme::ACAS, bias, c, args...) = symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, c, args...)
+@inline biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme::ACAS, bias, c, args...) = symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, c, args...)
+@inline biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme::ACAS, bias, c, args...) = symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, c, args...)
 
 # uniform centered reconstruction
 for buffer in advection_buffers
diff --git a/src/Advection/positivity_preserving_tracer_advection_operators.jl b/src/Advection/positivity_preserving_tracer_advection_operators.jl
index 0f07db7dab..ff652ebabb 100644
--- a/src/Advection/positivity_preserving_tracer_advection_operators.jl
+++ b/src/Advection/positivity_preserving_tracer_advection_operators.jl
@@ -29,10 +29,10 @@ end
 
     cᵢⱼ = @inbounds c[i, j, k]
 
-    c₊ᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i+1, j, k, grid, advection, c)
-    c₊ᴿ = _right_biased_interpolate_xᶠᵃᵃ(i+1, j, k, grid, advection, c)
-    c₋ᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i,   j, k, grid, advection, c)
-    c₋ᴿ = _right_biased_interpolate_xᶠᵃᵃ(i,   j, k, grid, advection, c)
+    c₊ᴸ = _biased_interpolate_xᶠᵃᵃ(i+1, j, k, grid, advection, LeftBias(),  c)
+    c₊ᴿ = _biased_interpolate_xᶠᵃᵃ(i+1, j, k, grid, advection, RightBias(), c)
+    c₋ᴸ = _biased_interpolate_xᶠᵃᵃ(i,   j, k, grid, advection, LeftBias(),  c)
+    c₋ᴿ = _biased_interpolate_xᶠᵃᵃ(i,   j, k, grid, advection, RightBias(), c)
 
     p̃   = (cᵢⱼ - ω̂₁ * c₋ᴿ - ω̂ₙ * c₊ᴸ) / (1 - 2ω̂₁)
     M   = max(p̃, c₊ᴸ, c₋ᴿ) 
@@ -53,10 +53,10 @@ end
 
     cᵢⱼ = @inbounds c[i, j, k]
 
-    c₊ᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j+1, k, grid, advection, c)
-    c₊ᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j+1, k, grid, advection, c)
-    c₋ᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j,   k, grid, advection, c)
-    c₋ᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j,   k, grid, advection, c)
+    c₊ᴸ = _biased_interpolate_yᵃᶠᵃ(i, j+1, k, grid, advection, LeftBias(),  c)
+    c₊ᴿ = _biased_interpolate_yᵃᶠᵃ(i, j+1, k, grid, advection, RightBias(), c)
+    c₋ᴸ = _biased_interpolate_yᵃᶠᵃ(i, j,   k, grid, advection, LeftBias(),  c)
+    c₋ᴿ = _biased_interpolate_yᵃᶠᵃ(i, j,   k, grid, advection, RightBias(), c)
 
     p̃   =  (cᵢⱼ - ω̂₁ * c₋ᴿ - ω̂ₙ * c₊ᴸ) / (1 - 2ω̂₁)
     M   = max(p̃, c₊ᴸ, c₋ᴿ) 
@@ -77,10 +77,10 @@ end
 
     cᵢⱼ = @inbounds c[i, j, k]
 
-    c₊ᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k+1, grid, advection, c)
-    c₊ᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k+1, grid, advection, c)
-    c₋ᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k,   grid, advection, c)
-    c₋ᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k,   grid, advection, c)
+    c₊ᴸ = _biased_interpolate_zᵃᵃᶠ(i, j, k+1, grid, advection, LeftBias(),  c)
+    c₊ᴿ = _biased_interpolate_zᵃᵃᶠ(i, j, k+1, grid, advection, RightBias(), c)
+    c₋ᴸ = _biased_interpolate_zᵃᵃᶠ(i, j, k,   grid, advection, LeftBias(),  c)
+    c₋ᴿ = _biased_interpolate_zᵃᵃᶠ(i, j, k,   grid, advection, RightBias(), c)
 
     p̃   =  (cᵢⱼ - ω̂₁ * c₋ᴿ - ω̂ₙ * c₊ᴸ) / (1 - 2ω̂₁)
     M   = max(p̃, c₊ᴸ, c₋ᴿ) 
diff --git a/src/Advection/reconstruction_coefficients.jl b/src/Advection/reconstruction_coefficients.jl
index adcf18c925..86a9affd40 100644
--- a/src/Advection/reconstruction_coefficients.jl
+++ b/src/Advection/reconstruction_coefficients.jl
@@ -1,28 +1,69 @@
 # Generic reconstruction methods valid for all reconstruction schemes
 # Unroll the functions to pass the coordinates in case of a stretched grid
+"""
+    @inline symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, ψ, args...)
+
+high order centered reconstruction of variable ψ in the x-direction. ψ can be a `Function`
+with signature `ψ(i, j, k, grid, args...)` or an `AbstractArray`
+"""
 @inline symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, ψ, args...) = inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, ψ, i, Face, args...)
+
+"""
+    @inline symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, ψ, args...)
+
+high order centered reconstruction of variable ψ in the y-direction. ψ can be a `Function`
+with signature `ψ(i, j, k, grid, args...)` or an `AbstractArray`
+"""
 @inline symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, ψ, args...) = inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, ψ, j, Face, args...)
+
+"""
+    @inline symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, ψ, args...)
+
+high order centered reconstruction of variable ψ in the z-direction. ψ can be a `Function`
+with signature `ψ(i, j, k, grid, args...)` or an `AbstractArray`
+"""
 @inline symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, ψ, args...) = inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, ψ, k, Face, args...)
 
+""" same as [`symmetric_interpolate_xᶠᵃᵃ`](@ref) but on `Center`s instead of `Face`s """
 @inline symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, ψ, args...) = inner_symmetric_interpolate_xᶠᵃᵃ(i+1, j, k, grid, scheme, ψ, i, Center, args...)
+""" same as [`symmetric_interpolate_yᵃᶠᵃ`](@ref) but on `Center`s instead of `Face`s """
 @inline symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, ψ, args...) = inner_symmetric_interpolate_yᵃᶠᵃ(i, j+1, k, grid, scheme, ψ, j, Center, args...)
+""" same as [`symmetric_interpolate_zᵃᵃᶠ`](@ref) but on `Center`s instead of `Face`s """
 @inline symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, ψ, args...) = inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k+1, grid, scheme, ψ, k, Center, args...)
 
-@inline left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, ψ, args...)  = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, ψ, i, Face, args...)
-@inline left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, ψ, args...)  = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, ψ, j, Face, args...)
-@inline left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, ψ, args...)  = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, ψ, k, Face, args...)
+"""
+    @inline biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, bias, ψ, args...)
 
-@inline right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, ψ, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, ψ, i, Face, args...)
-@inline right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, ψ, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, ψ, j, Face, args...)
-@inline right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, ψ, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, ψ, k, Face, args...)
+high order biased reconstruction of variable ψ in the x-direction. ψ can be a `Function`
+with signature `ψ(i, j, k, grid, args...)` or an `AbstractArray`. The `bias` argument is
+either `LeftBias` for a left biased reconstruction, or `RightBias` for a right biased reconstruction
+"""
+@inline biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, bias, ψ, args...)  = inner_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, bias, ψ, i, Face, args...)
 
-@inline left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, ψ, args...)  = inner_left_biased_interpolate_xᶠᵃᵃ(i+1, j, k, grid, scheme, ψ, i, Center, args...)
-@inline left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, ψ, args...)  = inner_left_biased_interpolate_yᵃᶠᵃ(i, j+1, k, grid, scheme, ψ, j, Center, args...)
-@inline left_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, ψ, args...)  = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k+1, grid, scheme, ψ, k, Center, args...)
+"""
+    @inline biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, bias, ψ, args...)
 
-@inline right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, ψ, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i+1, j, k, grid, scheme, ψ, i, Center, args...)
-@inline right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, ψ, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j+1, k, grid, scheme, ψ, j, Center, args...)
-@inline right_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, ψ, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k+1, grid, scheme, ψ, k, Center, args...)
+high order biased reconstruction of variable ψ in the y-direction. ψ can be a `Function`
+with signature `ψ(i, j, k, grid, args...)` or an `AbstractArray`. The `bias` argument is
+either `LeftBias` for a left biased reconstruction, or `RightBias` for a right biased reconstruction
+"""
+@inline biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, bias, ψ, args...)  = inner_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, bias, ψ, j, Face, args...)
+
+"""
+    @inline biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, bias, ψ, args...)
+
+high order biased reconstruction of variable ψ in the z-direction. ψ can be a `Function`
+with signature `ψ(i, j, k, grid, args...)` or an `AbstractArray`. The `bias` argument is
+either `LeftBias` for a left biased reconstruction, or `RightBias` for a right biased reconstruction
+"""
+@inline biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, bias, ψ, args...)  = inner_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, bias, ψ, k, Face, args...)
+
+""" same as [`biased_interpolate_xᶠᵃᵃ`](@ref) but on `Center`s instead of `Face`s """
+@inline biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, bias, ψ, args...)  = inner_biased_interpolate_xᶠᵃᵃ(i+1, j, k, grid, scheme, bias, ψ, i, Center, args...)
+""" same as [`biased_interpolate_yᵃᶠᵃ`](@ref) but on `Center`s instead of `Face`s """
+@inline biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, bias, ψ, args...)  = inner_biased_interpolate_yᵃᶠᵃ(i, j+1, k, grid, scheme, bias, ψ, j, Center, args...)
+""" same as [`biased_interpolate_zᵃᵃᶠ`](@ref) but on `Center`s instead of `Face`s """
+@inline biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, bias, ψ, args...)  = inner_biased_interpolate_zᵃᵃᶠ(i, j, k+1, grid, scheme, bias, ψ, k, Center, args...)
 
 struct FirstDerivative end
 struct SecondDerivative end
@@ -209,40 +250,43 @@ end
 @inline function compute_reconstruction_coefficients(grid, FT, scheme; order)
 
     method = scheme == :Centered ? 1 : scheme == :Upwind ? 2 : 3
-
-    rect_metrics = (:xᶠᵃᵃ, :xᶜᵃᵃ, :yᵃᶠᵃ, :yᵃᶜᵃ, :zᵃᵃᶠ, :zᵃᵃᶜ)
-
+    
     if grid isa Nothing
-        for metric in rect_metrics
-            @eval $(Symbol(:coeff_ , metric)) = nothing
-            @eval $(Symbol(:smooth_, metric)) = nothing
-        end
+        coeff_xᶠᵃᵃ = nothing
+        coeff_xᶜᵃᵃ = nothing
+        coeff_yᵃᶠᵃ = nothing
+        coeff_yᵃᶜᵃ = nothing
+        coeff_zᵃᵃᶠ = nothing
+        coeff_zᵃᵃᶜ = nothing
     else
-        metrics = coordinates(grid)
-        dirsize = (:Nx, :Nx, :Ny, :Ny, :Nz, :Nz)
-
         arch       = architecture(grid)
         Hx, Hy, Hz = halo_size(grid)
         new_grid   = with_halo((Hx+1, Hy+1, Hz+1), grid)
-
-        for (dir, metric, rect_metric) in zip(dirsize, metrics, rect_metrics)
-            @eval $(Symbol(:coeff_ , rect_metric)) = calc_reconstruction_coefficients($FT, $new_grid.$metric, $arch, $new_grid.$dir, Val($method); order = $order)
-        end
+        metrics    = coordinates(grid)
+    
+        coeff_xᶠᵃᵃ = reconstruction_coefficients(FT, getproperty(new_grid, metrics[1]), arch, new_grid.Nx, Val(method); order)
+        coeff_xᶜᵃᵃ = reconstruction_coefficients(FT, getproperty(new_grid, metrics[2]), arch, new_grid.Nx, Val(method); order)
+        coeff_yᵃᶠᵃ = reconstruction_coefficients(FT, getproperty(new_grid, metrics[3]), arch, new_grid.Ny, Val(method); order)
+        coeff_yᵃᶜᵃ = reconstruction_coefficients(FT, getproperty(new_grid, metrics[4]), arch, new_grid.Ny, Val(method); order)
+        coeff_zᵃᵃᶠ = reconstruction_coefficients(FT, getproperty(new_grid, metrics[5]), arch, new_grid.Nz, Val(method); order)
+        coeff_zᵃᵃᶜ = reconstruction_coefficients(FT, getproperty(new_grid, metrics[6]), arch, new_grid.Nz, Val(method); order)
     end
 
     return (coeff_xᶠᵃᵃ, coeff_xᶜᵃᵃ, coeff_yᵃᶠᵃ, coeff_yᵃᶜᵃ, coeff_zᵃᵃᶠ, coeff_zᵃᵃᶜ)
 end
 
-# Fallback for uniform directions
+# Fallbacks for uniform or `Flat` directions: these methods return `nothing` instead of coefficients
 for val in [1, 2, 3]
     @eval begin
-        @inline calc_reconstruction_coefficients(FT, coord::OffsetArray{<:Any, <:Any, <:AbstractRange}, arch, N, ::Val{$val}; order) = nothing
-        @inline calc_reconstruction_coefficients(FT, coord::AbstractRange, arch, N, ::Val{$val}; order)                              = nothing
+        @inline reconstruction_coefficients(FT, coord::OffsetArray{<:Any, <:Any, <:AbstractRange}, arch, N, ::Val{$val}; order) = nothing
+        @inline reconstruction_coefficients(FT, coord::AbstractRange, arch, N, ::Val{$val}; order)                              = nothing
+        @inline reconstruction_coefficients(FT, coord::Nothing, arch, N, ::Val{$val}; order)                                    = nothing
+        @inline reconstruction_coefficients(FT, coord::Number, arch, N, ::Val{$val}; order)                                     = nothing
     end
 end
 
 # Stretched reconstruction coefficients for `Centered` schemes
-@inline function calc_reconstruction_coefficients(FT, coord, arch, N, ::Val{1}; order) 
+@inline function reconstruction_coefficients(FT, coord, arch, N, ::Val{1}; order) 
     cpu_coord = on_architecture(CPU(), coord)
     r = ((order + 1) ÷ 2) - 1
     s = create_reconstruction_coefficients(FT, r, cpu_coord, arch, N; order)
@@ -250,7 +294,7 @@ end
 end
 
 # Stretched reconstruction coefficients for `UpwindBiased` schemes
-@inline function calc_reconstruction_coefficients(FT, coord, arch, N, ::Val{2}; order) 
+@inline function reconstruction_coefficients(FT, coord, arch, N, ::Val{2}; order) 
     cpu_coord = on_architecture(CPU(), coord)
     rleft  = ((order + 1) ÷ 2) - 2
     rright = ((order + 1) ÷ 2) - 1
@@ -262,8 +306,7 @@ end
 end
 
 # Stretched reconstruction coefficients for `WENO` schemes
-@inline function calc_reconstruction_coefficients(FT, coord, arch, N, ::Val{3}; order) 
-
+@inline function reconstruction_coefficients(FT, coord, arch, N, ::Val{3}; order) 
     cpu_coord = on_architecture(CPU(), coord)
     s = []
     for r in -1:order-1
diff --git a/src/Advection/topologically_conditional_interpolation.jl b/src/Advection/topologically_conditional_interpolation.jl
index 3c55d9b73c..094f5a7253 100644
--- a/src/Advection/topologically_conditional_interpolation.jl
+++ b/src/Advection/topologically_conditional_interpolation.jl
@@ -26,12 +26,13 @@ const AUGXYZ = AUG{<:Any, <:Bounded, <:Bounded, <:Bounded}
 # Left-biased buffers are smaller by one grid point on the right side; vice versa for right-biased buffers
 # Center interpolation stencil look at i + 1 (i.e., require one less point on the left)
 
-@inline    outside_symmetric_haloᶠ(i, N, adv) = (i >= required_halo_size(adv) + 1) & (i <= N + 1 - required_halo_size(adv))
-@inline    outside_symmetric_haloᶜ(i, N, adv) = (i >= required_halo_size(adv))     & (i <= N + 1 - required_halo_size(adv))
-@inline  outside_left_biased_haloᶠ(i, N, adv) = (i >= required_halo_size(adv) + 1) & (i <= N + 1 - (required_halo_size(adv) - 1))
-@inline  outside_left_biased_haloᶜ(i, N, adv) = (i >= required_halo_size(adv))     & (i <= N + 1 - (required_halo_size(adv) - 1))
-@inline outside_right_biased_haloᶠ(i, N, adv) = (i >= required_halo_size(adv))     & (i <= N + 1 - required_halo_size(adv))
-@inline outside_right_biased_haloᶜ(i, N, adv) = (i >= required_halo_size(adv) - 1) & (i <= N + 1 - required_halo_size(adv))
+@inline outside_symmetric_haloᶠ(i, N, adv) = (i >= required_halo_size(adv) + 1) & (i <= N + 1 - required_halo_size(adv))
+@inline outside_symmetric_haloᶜ(i, N, adv) = (i >= required_halo_size(adv))     & (i <= N + 1 - required_halo_size(adv))
+
+@inline outside_biased_haloᶠ(i, N, adv) = (i >= required_halo_size(adv) + 1) & (i <= N + 1 - (required_halo_size(adv) - 1)) &  # Left bias
+                                          (i >= required_halo_size(adv))     & (i <= N + 1 - required_halo_size(adv))          # Right bias
+@inline outside_biased_haloᶜ(i, N, adv) = (i >= required_halo_size(adv))     & (i <= N + 1 - (required_halo_size(adv) - 1)) &  # Left bias
+                                          (i >= required_halo_size(adv) - 1) & (i <= N + 1 - required_halo_size(adv))          # Right bias
 
 # Separate High order advection from low order advection
 const HOADV = Union{WENO, 
@@ -39,25 +40,25 @@ const HOADV = Union{WENO,
                     Tuple(UpwindBiased{N} for N in advection_buffers[2:end])...} 
 const LOADV = Union{UpwindBiased{1}, Centered{1}}
 
-for bias in (:symmetric, :left_biased, :right_biased)
-
+for bias in (:symmetric, :biased)
     for (d, ξ) in enumerate((:x, :y, :z))
 
         code = [:ᵃ, :ᵃ, :ᵃ]
 
-        for loc in (:ᶜ, :ᶠ)
+        for loc in (:ᶜ, :ᶠ), (alt1, alt2) in zip((:_, :__, :___, :____, :_____), (:_____, :_, :__, :___, :____))
             code[d] = loc
             second_order_interp = Symbol(:ℑ, ξ, code...)
             interp = Symbol(bias, :_interpolate_, ξ, code...)
-            alt_interp = Symbol(:_, interp)
+            alt1_interp = Symbol(alt1, interp)
+            alt2_interp = Symbol(alt2, interp)
 
             # Simple translation for Periodic directions and low-order advection schemes (fallback)
-            @eval @inline $alt_interp(i, j, k, grid::AUG, scheme::LOADV, args...) = $interp(i, j, k, grid, scheme, args...)
-            @eval @inline $alt_interp(i, j, k, grid::AUG, scheme::HOADV, args...) = $interp(i, j, k, grid, scheme, args...)
+            @eval @inline $alt1_interp(i, j, k, grid::AUG, scheme::HOADV, args...) = $interp(i, j, k, grid, scheme, args...)
+            @eval @inline $alt1_interp(i, j, k, grid::AUG, scheme::LOADV, args...) = $interp(i, j, k, grid, scheme, args...)
 
             # Disambiguation
             for GridType in [:AUGX, :AUGY, :AUGZ, :AUGXY, :AUGXZ, :AUGYZ, :AUGXYZ]
-                @eval @inline $alt_interp(i, j, k, grid::$GridType, scheme::LOADV, args...) = $interp(i, j, k, grid, scheme, args...)
+                @eval @inline $alt1_interp(i, j, k, grid::$GridType, scheme::LOADV, args...) = $interp(i, j, k, grid, scheme, args...)
             end
 
             outside_buffer = Symbol(:outside_, bias, :_halo, loc)
@@ -65,39 +66,24 @@ for bias in (:symmetric, :left_biased, :right_biased)
             # Conditional high-order interpolation in Bounded directions
             if ξ == :x
                 @eval begin
-                    @inline $alt_interp(i, j, k, grid::AUGX, scheme::HOADV, ψ) =
-                        ifelse($outside_buffer(i, grid.Nx, scheme),
-                               $interp(i, j, k, grid, scheme, ψ),
-                               $alt_interp(i, j, k, grid, scheme.buffer_scheme, ψ))
-
-                    @inline $alt_interp(i, j, k, grid::AUGX, scheme::HOADV, f::Function, args...) =
+                    @inline $alt1_interp(i, j, k, grid::AUGX, scheme::HOADV, args...) =
                         ifelse($outside_buffer(i, grid.Nx, scheme),
-                               $interp(i, j, k, grid, scheme, f, args...),
-                               $alt_interp(i, j, k, grid, scheme.buffer_scheme, f, args...))
+                               $interp(i, j, k, grid, scheme, args...),
+                               $alt2_interp(i, j, k, grid, scheme.buffer_scheme, args...))
                 end
             elseif ξ == :y
                 @eval begin
-                    @inline $alt_interp(i, j, k, grid::AUGY, scheme::HOADV, ψ) =
+                    @inline $alt1_interp(i, j, k, grid::AUGY, scheme::HOADV, args...) =
                         ifelse($outside_buffer(j, grid.Ny, scheme),
-                               $interp(i, j, k, grid, scheme, ψ),
-                               $alt_interp(i, j, k, grid, scheme.buffer_scheme, ψ))
-
-                    @inline $alt_interp(i, j, k, grid::AUGY, scheme::HOADV, f::Function, args...) =
-                        ifelse($outside_buffer(j, grid.Ny, scheme),
-                               $interp(i, j, k, grid, scheme, f, args...),
-                               $alt_interp(i, j, k, grid, scheme.buffer_scheme, f, args...))
+                               $interp(i, j, k, grid, scheme, args...),
+                               $alt2_interp(i, j, k, grid, scheme.buffer_scheme, args...))
                 end
             elseif ξ == :z
                 @eval begin
-                    @inline $alt_interp(i, j, k, grid::AUGZ, scheme::HOADV, ψ) =
-                        ifelse($outside_buffer(k, grid.Nz, scheme),
-                               $interp(i, j, k, grid, scheme, ψ),
-                               $alt_interp(i, j, k, grid, scheme.buffer_scheme, ψ))
-
-                    @inline $alt_interp(i, j, k, grid::AUGZ, scheme::HOADV, f::Function, args...) =
+                    @inline $alt1_interp(i, j, k, grid::AUGZ, scheme::HOADV, args...) =
                         ifelse($outside_buffer(k, grid.Nz, scheme),
-                               $interp(i, j, k, grid, scheme, f, args...),
-                               $alt_interp(i, j, k, grid, scheme.buffer_scheme, f, args...))
+                               $interp(i, j, k, grid, scheme, args...),
+                               $alt2_interp(i, j, k, grid, scheme.buffer_scheme, args...))
                 end
             end
         end
diff --git a/src/Advection/tracer_advection_operators.jl b/src/Advection/tracer_advection_operators.jl
index 86cc43229b..fcae1923ce 100644
--- a/src/Advection/tracer_advection_operators.jl
+++ b/src/Advection/tracer_advection_operators.jl
@@ -26,16 +26,55 @@ function TracerAdvection(x_advection, y_advection, z_advection)
     return TracerAdvection{H, FT}(x_advection, y_advection, z_advection)
 end
 
+Adapt.adapt_structure(to, scheme::TracerAdvection{N, FT}) where {N, FT} = 
+    TracerAdvection{N, FT}(Adapt.adapt(to, scheme.x),
+                           Adapt.adapt(to, scheme.y),
+                           Adapt.adapt(to, scheme.z))
+
 @inline _advective_tracer_flux_x(args...) = advective_tracer_flux_x(args...)
 @inline _advective_tracer_flux_y(args...) = advective_tracer_flux_y(args...)
 @inline _advective_tracer_flux_z(args...) = advective_tracer_flux_z(args...)
 
-@inline div_Uc(i, j, k, grid, advection, ::ZeroU, c) = zero(grid)
-@inline div_Uc(i, j, k, grid, advection, U, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_x(i, j, k, grid, advection::TracerAdvection, args...) =
+        _advective_tracer_flux_x(i, j, k, grid, advection.x, args...)
 
-@inline div_Uc(i, j, k, grid, ::Nothing, U, c) = zero(grid)
-@inline div_Uc(i, j, k, grid, ::Nothing, ::ZeroU, c) = zero(grid)
-@inline div_Uc(i, j, k, grid, ::Nothing, U, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_y(i, j, k, grid, advection::TracerAdvection, args...) =
+        _advective_tracer_flux_y(i, j, k, grid, advection.y, args...)
+
+@inline _advective_tracer_flux_z(i, j, k, grid, advection::TracerAdvection, args...) =
+        _advective_tracer_flux_z(i, j, k, grid, advection.z, args...)
+
+# Fallback for `nothing` advection
+@inline _advective_tracer_flux_x(i, j, k, grid, ::Nothing, args...) = zero(grid)
+@inline _advective_tracer_flux_y(i, j, k, grid, ::Nothing, args...) = zero(grid)
+@inline _advective_tracer_flux_z(i, j, k, grid, ::Nothing, args...) = zero(grid)
+
+# Fallback for `nothing` advection and `ZeroField` tracers and velocities
+@inline _advective_tracer_flux_x(i, j, k, grid, ::Nothing, ::ZeroField, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_y(i, j, k, grid, ::Nothing, ::ZeroField, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_z(i, j, k, grid, ::Nothing, ::ZeroField, ::ZeroField) = zero(grid)
+
+@inline _advective_tracer_flux_x(i, j, k, grid, ::Nothing, U, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_y(i, j, k, grid, ::Nothing, V, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_z(i, j, k, grid, ::Nothing, W, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_x(i, j, k, grid, ::Nothing, ::ZeroField, c) = zero(grid)
+@inline _advective_tracer_flux_y(i, j, k, grid, ::Nothing, ::ZeroField, c) = zero(grid)
+@inline _advective_tracer_flux_z(i, j, k, grid, ::Nothing, ::ZeroField, c) = zero(grid)
+
+# Fallback for `ZeroField` tracers and velocities
+@inline _advective_tracer_flux_x(i, j, k, grid, scheme, ::ZeroField, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_y(i, j, k, grid, scheme, ::ZeroField, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_z(i, j, k, grid, scheme, ::ZeroField, ::ZeroField) = zero(grid)
+
+# Fallback for `ZeroField` tracers
+@inline _advective_tracer_flux_x(i, j, k, grid, scheme, U, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_y(i, j, k, grid, scheme, V, ::ZeroField) = zero(grid)
+@inline _advective_tracer_flux_z(i, j, k, grid, scheme, W, ::ZeroField) = zero(grid)
+
+# Fallback for `ZeroField` velocities
+@inline _advective_tracer_flux_x(i, j, k, grid, scheme, ::ZeroField, c) = zero(grid)
+@inline _advective_tracer_flux_y(i, j, k, grid, scheme, ::ZeroField, c) = zero(grid)
+@inline _advective_tracer_flux_z(i, j, k, grid, scheme, ::ZeroField, c) = zero(grid)
 
 #####
 ##### Tracer advection operator
@@ -58,8 +97,10 @@ which ends up at the location `ccc`.
                                     δzᵃᵃᶜ(i, j, k, grid, _advective_tracer_flux_z, advection, U.w, c))
 end
 
-@inline function div_Uc(i, j, k, grid, advection::TracerAdvection, U, c)
-    return 1/Vᶜᶜᶜ(i, j, k, grid) * (δxᶜᵃᵃ(i, j, k, grid, _advective_tracer_flux_x, advection.x, U.u, c) +
-                                    δyᵃᶜᵃ(i, j, k, grid, _advective_tracer_flux_y, advection.y, U.v, c) +
-                                    δzᵃᵃᶜ(i, j, k, grid, _advective_tracer_flux_z, advection.z, U.w, c))
-end
+# Fallbacks for zero velocities, zero tracer and `nothing` advection
+@inline div_Uc(i, j, k, grid, advection, ::ZeroU, c) = zero(grid)
+@inline div_Uc(i, j, k, grid, advection, U, ::ZeroField) = zero(grid)
+
+@inline div_Uc(i, j, k, grid, ::Nothing, U, c) = zero(grid)
+@inline div_Uc(i, j, k, grid, ::Nothing, ::ZeroU, c) = zero(grid)
+@inline div_Uc(i, j, k, grid, ::Nothing, U, ::ZeroField) = zero(grid)
diff --git a/src/Advection/upwind_biased_advective_fluxes.jl b/src/Advection/upwind_biased_advective_fluxes.jl
index 701c266519..4d4a5b5960 100644
--- a/src/Advection/upwind_biased_advective_fluxes.jl
+++ b/src/Advection/upwind_biased_advective_fluxes.jl
@@ -15,85 +15,81 @@ const UpwindScheme = AbstractUpwindBiasedAdvectionScheme
 ##### Note the convention "advective_momentum_flux_AB" corresponds to the advection _of_ B _by_ A.
 #####
 
+struct LeftBias end
+struct RightBias end
+
+@inline bias(u::Number) = ifelse(u > 0, LeftBias(), RightBias())
+
 @inline function advective_momentum_flux_Uu(i, j, k, grid, scheme::UpwindScheme, U, u)
 
-    ũ  =    _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
-    uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, u)
-    uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, u)
+    ũ  = _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
+    uᴿ =    _biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, bias(ũ), u)
 
-    return upwind_biased_product(ũ, uᴸ, uᴿ)
+    return ũ * uᴿ
 end
 
 @inline function advective_momentum_flux_Vu(i, j, k, grid, scheme::UpwindScheme, V, u)
 
-    ṽ  =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
-    uᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, u)
-    uᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, u)
+    ṽ  = _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
+    uᴿ =    _biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, bias(ṽ), u)
 
-    return upwind_biased_product(ṽ, uᴸ, uᴿ)
+    return ṽ * uᴿ
 end
 
 @inline function advective_momentum_flux_Wu(i, j, k, grid, scheme::UpwindScheme, W, u)
 
-    w̃  =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
-    uᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, u)
-    uᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, u)
+    w̃  = _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
+    uᴿ =    _biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, bias(w̃), u)
 
-    return upwind_biased_product(w̃, uᴸ, uᴿ)
+    return w̃ * uᴿ
 end
 
 @inline function advective_momentum_flux_Uv(i, j, k, grid, scheme::UpwindScheme, U, v)
 
-    ũ  =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
-    vᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, v)
-    vᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, v)
+    ũ  = _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
+    vᴿ =    _biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, bias(ũ), v)
  
-    return upwind_biased_product(ũ, vᴸ, vᴿ)
+    return ũ * vᴿ
 end
 
 @inline function advective_momentum_flux_Vv(i, j, k, grid, scheme::UpwindScheme, V, v)
 
-    ṽ  =    _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
-    vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, v)
-    vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, v)
+    ṽ  = _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
+    vᴿ =    _biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, bias(ṽ), v)
 
-    return upwind_biased_product(ṽ, vᴸ, vᴿ)
+    return ṽ * vᴿ
 end
 
 @inline function advective_momentum_flux_Wv(i, j, k, grid, scheme::UpwindScheme, W, v)
 
-    w̃  =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
-    vᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, v)
-    vᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, v)
+    w̃  = _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
+    vᴿ =    _biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, bias(w̃), v)
 
-    return upwind_biased_product(w̃, vᴸ, vᴿ)
+    return w̃ * vᴿ
 end
 
 @inline function advective_momentum_flux_Uw(i, j, k, grid, scheme::UpwindScheme, U, w)
 
-    ũ  =    _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
-    wᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, w)
-    wᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, w)
+    ũ  = _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
+    wᴿ =    _biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, bias(ũ), w)
 
-    return upwind_biased_product(ũ, wᴸ, wᴿ)
+    return ũ * wᴿ
 end
 
 @inline function advective_momentum_flux_Vw(i, j, k, grid, scheme::UpwindScheme, V, w)
 
-    ṽ  =    _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
-    wᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, w)
-    wᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, w)
+    ṽ  = _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
+    wᴿ =    _biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, bias(ṽ), w)
 
-    return upwind_biased_product(ṽ, wᴸ, wᴿ)
+    return ṽ * wᴿ
 end
 
 @inline function advective_momentum_flux_Ww(i, j, k, grid, scheme::UpwindScheme, W, w)
 
-    w̃  =    _symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
-    wᴸ =  _left_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, w)
-    wᴿ = _right_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, w)
+    w̃  = _symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
+    wᴿ =    _biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, bias(w̃), w)
 
-    return upwind_biased_product(w̃, wᴸ, wᴿ)
+    return w̃ * wᴿ
 end
 
 #####
@@ -103,26 +99,23 @@ end
 @inline function advective_tracer_flux_x(i, j, k, grid, scheme::UpwindScheme, U, c) 
 
     @inbounds ũ = U[i, j, k]
-    cᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, c)
-    cᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, c)
+    cᴿ = _biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, bias(ũ), c)
 
-    return Axᶠᶜᶜ(i, j, k, grid) * upwind_biased_product(ũ, cᴸ, cᴿ)
+    return Axᶠᶜᶜ(i, j, k, grid) * ũ * cᴿ
 end
 
 @inline function advective_tracer_flux_y(i, j, k, grid, scheme::UpwindScheme, V, c)
 
     @inbounds ṽ = V[i, j, k]
-    cᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, c)
-    cᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, c)
+    cᴿ = _biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, bias(ṽ), c)
 
-    return Ayᶜᶠᶜ(i, j, k, grid) * upwind_biased_product(ṽ, cᴸ, cᴿ)
+    return Ayᶜᶠᶜ(i, j, k, grid) * ṽ * cᴿ
 end
 
 @inline function advective_tracer_flux_z(i, j, k, grid, scheme::UpwindScheme, W, c)
 
     @inbounds w̃ = W[i, j, k]
-    cᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, c)
-    cᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, c)
+    cᴿ = _biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, bias(w̃), c)
 
-    return Azᶜᶜᶠ(i, j, k, grid) * upwind_biased_product(w̃, cᴸ, cᴿ) 
+    return Azᶜᶜᶠ(i, j, k, grid) * w̃ * cᴿ
 end
diff --git a/src/Advection/upwind_biased_reconstruction.jl b/src/Advection/upwind_biased_reconstruction.jl
index e7b6955cc7..eb3c6805ec 100644
--- a/src/Advection/upwind_biased_reconstruction.jl
+++ b/src/Advection/upwind_biased_reconstruction.jl
@@ -110,34 +110,52 @@ const AUAS = AbstractUpwindBiasedAdvectionScheme
 @inline symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme::AUAS, v, args...) = @inbounds symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme.advecting_velocity_scheme, v, args...)
 @inline symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme::AUAS, w, args...) = @inbounds symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme.advecting_velocity_scheme, w, args...)
 
-# uniform upwind biased reconstruction
-for side in (:left, :right)
-    stencil_x = Symbol(:inner_, side, :_biased_interpolate_xᶠᵃᵃ)
-    stencil_y = Symbol(:inner_, side, :_biased_interpolate_yᵃᶠᵃ)
-    stencil_z = Symbol(:inner_, side, :_biased_interpolate_zᵃᵃᶠ)
+const UX{N, FT} = UpwindBiased{N, FT, <:Nothing} where {N, FT}
+const UY{N, FT} = UpwindBiased{N, FT, <:Any, <:Nothing} where {N, FT}
+const UZ{N, FT} = UpwindBiased{N, FT, <:Any, <:Any, <:Nothing} where {N, FT}
+
+# Uniform upwind biased reconstruction
+for buffer in advection_buffers
+    @eval begin
+        @inline inner_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, ::UX{$buffer, FT}, bias, ψ, idx, loc, args...) where FT = 
+            @inbounds ifelse(bias isa LeftBias, $(calc_reconstruction_stencil(buffer, :left,  :x, false)), 
+                                                $(calc_reconstruction_stencil(buffer, :right, :x, false)))
+
+        @inline inner_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, ::UX{$buffer, FT}, bias, ψ::Function, idx, loc, args...) where FT = 
+            @inbounds ifelse(bias isa LeftBias, $(calc_reconstruction_stencil(buffer, :left,  :x, true)), 
+                                                $(calc_reconstruction_stencil(buffer, :right, :x, true)))
+    
+        @inline inner_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, ::UY{$buffer, FT}, bias, ψ, idx, loc, args...) where FT = 
+            @inbounds ifelse(bias isa LeftBias, $(calc_reconstruction_stencil(buffer, :left,  :y, false)), 
+                                                $(calc_reconstruction_stencil(buffer, :right, :y, false)))
+                                                 
+        @inline inner_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, ::UY{$buffer, FT}, bias, ψ::Function, idx, loc, args...) where FT = 
+            @inbounds ifelse(bias isa LeftBias, $(calc_reconstruction_stencil(buffer, :left,  :y, true)), 
+                                                $(calc_reconstruction_stencil(buffer, :right, :y, true)))
+    
+        @inline inner_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, ::UZ{$buffer, FT}, bias, ψ, idx, loc, args...) where FT = 
+            @inbounds ifelse(bias isa LeftBias, $(calc_reconstruction_stencil(buffer, :left,  :z, false)), 
+                                                $(calc_reconstruction_stencil(buffer, :right, :z, false)))
 
-    for buffer in advection_buffers
-        @eval begin
-            @inline $stencil_x(i, j, k, grid, scheme::UpwindBiased{$buffer, FT, <:Nothing}, ψ, idx, loc, args...)           where FT = @inbounds $(calc_reconstruction_stencil(buffer, side, :x, false))
-            @inline $stencil_x(i, j, k, grid, scheme::UpwindBiased{$buffer, FT, <:Nothing}, ψ::Function, idx, loc, args...) where FT = @inbounds $(calc_reconstruction_stencil(buffer, side, :x,  true))
-        
-            @inline $stencil_y(i, j, k, grid, scheme::UpwindBiased{$buffer, FT, XT, <:Nothing}, ψ, idx, loc, args...)           where {FT, XT} = @inbounds $(calc_reconstruction_stencil(buffer, side, :y, false))
-            @inline $stencil_y(i, j, k, grid, scheme::UpwindBiased{$buffer, FT, XT, <:Nothing}, ψ::Function, idx, loc, args...) where {FT, XT} = @inbounds $(calc_reconstruction_stencil(buffer, side, :y,  true))
-        
-            @inline $stencil_z(i, j, k, grid, scheme::UpwindBiased{$buffer, FT, XT, YT, <:Nothing}, ψ, idx, loc, args...)           where {FT, XT, YT} = @inbounds $(calc_reconstruction_stencil(buffer, side, :z, false))
-            @inline $stencil_z(i, j, k, grid, scheme::UpwindBiased{$buffer, FT, XT, YT, <:Nothing}, ψ::Function, idx, loc, args...) where {FT, XT, YT} = @inbounds $(calc_reconstruction_stencil(buffer, side, :z,  true))
-        end
+        @inline inner_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, ::UZ{$buffer, FT}, bias, ψ::Function, idx, loc, args...) where FT = 
+            @inbounds ifelse(bias isa LeftBias, $(calc_reconstruction_stencil(buffer, :left,  :z, true)), 
+                                                $(calc_reconstruction_stencil(buffer, :right, :z, true)))                                          
     end
 end
 
-# stretched upwind biased reconstruction
-for (sd, side) in enumerate((:left, :right)), (dir, ξ, val) in zip((:xᶠᵃᵃ, :yᵃᶠᵃ, :zᵃᵃᶠ), (:x, :y, :z), (1, 2, 3))
-    stencil = Symbol(:inner_, side, :_biased_interpolate_, dir)
+# Stretched upwind reconstruction
+for (dir, ξ, val) in zip((:xᶠᵃᵃ, :yᵃᶠᵃ, :zᵃᵃᶠ), (:x, :y, :z), (1, 2, 3))
+    stencil = Symbol(:inner_biased_interpolate_, dir)
 
     for buffer in advection_buffers
         @eval begin
-            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer, FT}, ψ, idx, loc, args...)           where FT = @inbounds sum($(reconstruction_stencil(buffer, side, ξ, false)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
-            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer, FT}, ψ::Function, idx, loc, args...) where FT = @inbounds sum($(reconstruction_stencil(buffer, side, ξ,  true)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
+            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer, FT}, bias, ψ, idx, loc, args...) where FT = 
+                @inbounds ifelse(bias isa LeftBias, sum($(reconstruction_stencil(buffer, :left,  ξ, false)) .* retrieve_coeff(scheme, Val(1), Val($val), idx, loc)),
+                                                     sum($(reconstruction_stencil(buffer, :right, ξ, false)) .* retrieve_coeff(scheme, Val(2), Val($val), idx, loc)))
+
+            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer, FT}, bias, ψ::Function, idx, loc, args...) where FT = 
+                @inbounds ifelse(bias isa LeftBias, sum($(reconstruction_stencil(buffer, :left,  ξ, true)) .* retrieve_coeff(scheme, Val(1), Val($val), idx, loc)),
+                                                     sum($(reconstruction_stencil(buffer, :right, ξ, true)) .* retrieve_coeff(scheme, Val(2), Val($val), idx, loc)))
         end
     end
 end
diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 537dcf6eb3..a9a89b2623 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -147,7 +147,6 @@ const VectorInvariantKineticEnergyUpwinding     = VectorInvariant{<:Any, <:Any,
 #                                                 VectorInvariant{N,     FT,    M,     Z,     ZS,     V,     K,     D,                                     U (upwinding)
 const VectorInvariantCrossVerticalUpwinding     = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any,  <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:CrossAndSelfUpwinding}
 const VectorInvariantSelfVerticalUpwinding      = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any,  <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:OnlySelfUpwinding}
-const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any,  <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:VelocityUpwinding}
 
 Base.summary(a::VectorInvariant)                 = string("Vector Invariant, Dimension-by-dimension reconstruction")
 Base.summary(a::MultiDimensionalVectorInvariant) = string("Vector Invariant, Multidimensional reconstruction")
@@ -174,15 +173,16 @@ nothing_to_default(user_value; default) = isnothing(user_value) ? default : user
                           weno_kw...)
 
 """
-function WENOVectorInvariant(; upwinding = nothing,
-                               vorticity_stencil = VelocityStencil(),
-                               order = nothing,
-                               vorticity_order = nothing,
-                               vertical_order = nothing,
-                               divergence_order = nothing,
-                               kinetic_energy_gradient_order = nothing, 
-                               multi_dimensional_stencil = false,
-                               weno_kw...)
+function WENOVectorInvariant(FT::DataType = Float64; 
+                             upwinding = nothing,
+                             vorticity_stencil = VelocityStencil(),
+                             order = nothing,
+                             vorticity_order = nothing,
+                             vertical_order = nothing,
+                             divergence_order = nothing,
+                             kinetic_energy_gradient_order = nothing, 
+                             multi_dimensional_stencil = false,
+                             weno_kw...)
 
     if isnothing(order) # apply global defaults
         vorticity_order               = nothing_to_default(vorticity_order,  default = 9)
@@ -196,10 +196,10 @@ function WENOVectorInvariant(; upwinding = nothing,
         kinetic_energy_gradient_order = nothing_to_default(kinetic_energy_gradient_order, default = order)
     end
 
-    vorticity_scheme               = WENO(; order = vorticity_order, weno_kw...)
-    vertical_scheme                = WENO(; order = vertical_order, weno_kw...)
-    kinetic_energy_gradient_scheme = WENO(; order = kinetic_energy_gradient_order, weno_kw...)
-    divergence_scheme              = WENO(; order = divergence_order, weno_kw...)
+    vorticity_scheme               = WENO(FT; order=vorticity_order, weno_kw...)
+    vertical_scheme                = WENO(FT; order=vertical_order, weno_kw...)
+    kinetic_energy_gradient_scheme = WENO(FT; order=kinetic_energy_gradient_order, weno_kw...)
+    divergence_scheme              = WENO(FT; order=divergence_order, weno_kw...)
 
     default_upwinding = OnlySelfUpwinding(cross_scheme = divergence_scheme)
     upwinding = nothing_to_default(upwinding; default = default_upwinding)
@@ -246,7 +246,7 @@ on_architecture(to, scheme::VectorInvariant{N, FT, M}) where {N, FT, M} =
                                                                     bernoulli_head_V(i, j, k, grid, scheme, U.u, U.v)
 
 # Extend interpolate functions for VectorInvariant to allow MultiDimensional reconstruction
-for bias in (:_left_biased, :_right_biased, :_symmetric)
+for bias in (:_biased, :_symmetric)
     for (dir1, dir2) in zip((:xᶠᵃᵃ, :xᶜᵃᵃ, :yᵃᶠᵃ, :yᵃᶜᵃ), (:y, :y, :x, :x))
         interp_func = Symbol(bias, :_interpolate_, dir1)
         multidim_interp = Symbol(:_multi_dimensional_reconstruction_, dir2)
@@ -340,10 +340,9 @@ end
     Sζ = scheme.vorticity_stencil
 
     @inbounds v̂ = ℑxᶠᵃᵃ(i, j, k, grid, ℑyᵃᶜᵃ, Δx_qᶜᶠᶜ, v) / Δxᶠᶜᶜ(i, j, k, grid) 
-    ζᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
-    ζᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
+    ζᴿ = _biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, bias(v̂), ζ₃ᶠᶠᶜ, Sζ, u, v)
 
-    return - upwind_biased_product(v̂, ζᴸ, ζᴿ)
+    return - v̂ * ζᴿ
 end
 
 @inline function horizontal_advection_V(i, j, k, grid, scheme::VectorInvariantUpwindVorticity, u, v) 
@@ -351,10 +350,9 @@ end
     Sζ = scheme.vorticity_stencil
 
     @inbounds û = ℑyᵃᶠᵃ(i, j, k, grid, ℑxᶜᵃᵃ, Δy_qᶠᶜᶜ, u) / Δyᶜᶠᶜ(i, j, k, grid)
-    ζᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
-    ζᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
+    ζᴿ = _biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, bias(û), ζ₃ᶠᶠᶜ, Sζ, u, v)
 
-    return + upwind_biased_product(û, ζᴸ, ζᴿ)
+    return + û * ζᴿ
 end
 
 #####
@@ -399,10 +397,10 @@ end
 @inline U_dot_∇u(i, j, k, grid::AbstractGrid{FT}, scheme::Nothing, U) where FT = zero(FT)
 @inline U_dot_∇v(i, j, k, grid::AbstractGrid{FT}, scheme::Nothing, U) where FT = zero(FT)
 
-const U{N}  = UpwindBiased{N}
-const UX{N} = UpwindBiased{N, <:Any, <:Nothing} 
-const UY{N} = UpwindBiased{N, <:Any, <:Any, <:Nothing}
-const UZ{N} = UpwindBiased{N, <:Any, <:Any, <:Any, <:Nothing}
+const UB{N}  = UpwindBiased{N}
+const UBX{N} = UpwindBiased{N, <:Any, <:Nothing} 
+const UBY{N} = UpwindBiased{N, <:Any, <:Any, <:Nothing}
+const UBZ{N} = UpwindBiased{N, <:Any, <:Any, <:Any, <:Nothing}
 
 const C{N}  = Centered{N, <:Any}
 const CX{N} = Centered{N, <:Any, <:Nothing}
@@ -411,7 +409,7 @@ const CZ{N} = Centered{N, <:Any, <:Any, <:Any, <:Nothing}
 
 const AS = AbstractSmoothnessStencil
 
-# To adapt passing smoothness stencils to upwind biased schemes and centered schemes (not weno) 
+# To adapt passing smoothness stencils to upwind biased schemes and centered schemes (not WENO) 
 for b in 1:6
     @eval begin
         @inline inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, s::C{$b},  f::Function, idx, loc, ::AS, args...) = inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
@@ -421,18 +419,11 @@ for b in 1:6
         @inline inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, s::CY{$b}, f::Function, idx, loc, ::AS, args...) = inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
         @inline inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, s::CZ{$b}, f::Function, idx, loc, ::AS, args...) = inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
 
-        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::UX{$b}, f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::UY{$b}, f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::UZ{$b}, f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
-
-        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::UX{$b}, f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::UY{$b}, f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::UZ{$b}, f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::UB{$b},  bias, f::Function, idx, loc, ::AS, args...) = inner_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, bias, f, idx, loc, args...)
+        @inline inner_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::UB{$b},  bias, f::Function, idx, loc, ::AS, args...) = inner_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, bias, f, idx, loc, args...)
+        @inline inner_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::UB{$b},  bias, f::Function, idx, loc, ::AS, args...) = inner_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, bias, f, idx, loc, args...)
+        @inline inner_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::UBX{$b}, bias, f::Function, idx, loc, ::AS, args...) = inner_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, bias, f, idx, loc, args...)
+        @inline inner_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::UBY{$b}, bias, f::Function, idx, loc, ::AS, args...) = inner_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, bias, f, idx, loc, args...)
+        @inline inner_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::UBZ{$b}, bias, f::Function, idx, loc, ::AS, args...) = inner_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, bias, f, idx, loc, args...)
     end
 end
diff --git a/src/Advection/vector_invariant_cross_upwinding.jl b/src/Advection/vector_invariant_cross_upwinding.jl
index 8ae27b8306..c3f2f88fd9 100644
--- a/src/Advection/vector_invariant_cross_upwinding.jl
+++ b/src/Advection/vector_invariant_cross_upwinding.jl
@@ -21,18 +21,16 @@
     @inbounds û = u[i, j, k]
     δ_stencil = scheme.upwinding.divergence_stencil
 
-    δᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
-    δᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+    δᴿ = _biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, bias(û), flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
 
-    return upwind_biased_product(û, δᴸ, δᴿ)
+    return û * δᴿ
 end
 
 @inline function upwinded_divergence_flux_Vᶜᶠᶜ(i, j, k, grid, scheme::VectorInvariantCrossVerticalUpwinding, u, v)
     @inbounds v̂ = v[i, j, k]
     δ_stencil = scheme.upwinding.divergence_stencil
 
-    δᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
-    δᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+    δᴿ = _biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, bias(v̂), flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
 
-    return upwind_biased_product(v̂, δᴸ, δᴿ) 
+    return v̂ * δᴿ
 end
diff --git a/src/Advection/vector_invariant_self_upwinding.jl b/src/Advection/vector_invariant_self_upwinding.jl
index d8f798006e..e8ee512cfd 100644
--- a/src/Advection/vector_invariant_self_upwinding.jl
+++ b/src/Advection/vector_invariant_self_upwinding.jl
@@ -18,11 +18,10 @@
     cross_scheme = scheme.upwinding.cross_scheme
 
     @inbounds û = u[i, j, k]
-    δvˢ =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_V, u, v) 
-    δuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, δx_U, δU_stencil, u, v) 
-    δuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, δx_U, δU_stencil, u, v) 
+    δvˢ = _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_V, u, v) 
+    δuᴿ =    _biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, bias(û), δx_U, δU_stencil, u, v) 
 
-    return upwind_biased_product(û, δuᴸ, δuᴿ) + û * δvˢ
+    return û * (δvˢ + δuᴿ)
 end
 
 @inline function upwinded_divergence_flux_Vᶜᶠᶜ(i, j, k, grid, scheme::VectorInvariantSelfVerticalUpwinding, u, v)
@@ -31,11 +30,10 @@ end
     cross_scheme = scheme.upwinding.cross_scheme
 
     @inbounds v̂ = v[i, j, k]
-    δuˢ =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, cross_scheme, δx_U, u, v)
-    δvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, δy_V, δV_stencil, u, v) 
-    δvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, δy_V, δV_stencil, u, v) 
+    δuˢ = _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, cross_scheme, δx_U, u, v)
+    δvᴿ =    _biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, bias(v̂), δy_V, δV_stencil, u, v) 
 
-    return upwind_biased_product(v̂, δvᴸ, δvᴿ) + v̂ * δuˢ
+    return v̂ * (δuˢ + δvᴿ)
 end
 
 #####
@@ -60,14 +58,10 @@ end
     δu²_stencil  = scheme.upwinding.δu²_stencil    
     cross_scheme = scheme.upwinding.cross_scheme
 
-    δKvˢ =    _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, cross_scheme, δx_v², u, v)
-    δKuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, δx_u², δu²_stencil, u, v)
-    δKuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, δx_u², δu²_stencil, u, v)
+    δKvˢ = _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, cross_scheme, δx_v², u, v)
+    δKuᴿ =    _biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, bias(û), δx_u², δu²_stencil, u, v)
     
-    ∂Kᴸ = (δKuᴸ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
-    ∂Kᴿ = (δKuᴿ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
-
-    return ifelse(û > 0, ∂Kᴸ, ∂Kᴿ)
+    return (δKuᴿ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
 end
 
 @inline function bernoulli_head_V(i, j, k, grid, scheme::VectorInvariantKineticEnergyUpwinding, u, v)
@@ -77,12 +71,8 @@ end
     δv²_stencil  = scheme.upwinding.δv²_stencil    
     cross_scheme = scheme.upwinding.cross_scheme
 
-    δKuˢ =    _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_u², u, v)
-    δKvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, δy_v², δv²_stencil, u, v) 
-    δKvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, δy_v², δv²_stencil, u, v) 
+    δKuˢ = _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_u², u, v)
+    δKvᴿ =    _biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, bias(v̂), δy_v², δv²_stencil, u, v) 
     
-    ∂Kᴸ = (δKvᴸ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid) 
-    ∂Kᴿ = (δKvᴿ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid)
-
-    return ifelse(v̂ > 0, ∂Kᴸ, ∂Kᴿ)
+    return (δKvᴿ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid)
 end
diff --git a/src/Advection/vector_invariant_upwinding.jl b/src/Advection/vector_invariant_upwinding.jl
index b2b24470a0..4949a453e2 100644
--- a/src/Advection/vector_invariant_upwinding.jl
+++ b/src/Advection/vector_invariant_upwinding.jl
@@ -88,25 +88,8 @@ CrossAndSelfUpwinding(; cross_scheme       = CenteredSecondOrder(),
                         δv²_stencil        = FunctionStencil(v_smoothness),
                         ) = CrossAndSelfUpwinding(extract_centered_scheme(cross_scheme), divergence_stencil, δu²_stencil, δv²_stencil)
 
-"""
-    VelocityUpwinding(; cross_scheme = CenteredSecondOrder()) 
-                                
-Upwinding treatment for Divergence fluxes and Kinetic Energy gradient in the Vector Invariant formulation, whereas only 
-the terms corresponding to the transporting velocity are upwinded. (i.e., terms in `u` in the zonal momentum equation and 
-terms in `v` in the meridional momentum equation). Contrarily to `OnlySelfUpwinding`, the reconstruction (and hence the
-upwinding) is done _inside_ the gradient operator, i.e., velocities are reconstructed instead of velocity derivatives.
-
-Keyword arguments
-=================  
-
-- `cross_scheme`: Advection scheme used for cross-reconstructed terms (tangential velocities) 
-                    in the kinetic energy gradient and the divergence flux. Defaults to `CenteredSecondOrder()`.
-"""
-VelocityUpwinding(; cross_scheme = CenteredSecondOrder()) = VelocityUpwinding(extract_centered_scheme(cross_scheme))
-                    
 Base.summary(a::OnlySelfUpwinding)     = "OnlySelfUpwinding"
 Base.summary(a::CrossAndSelfUpwinding) = "CrossAndSelfUpwinding"
-Base.summary(a::VelocityUpwinding)     = "VelocityUpwinding"
 
 Base.show(io::IO, a::OnlySelfUpwinding) =
     print(io, summary(a), " \n",
@@ -139,12 +122,3 @@ Adapt.adapt_structure(to, scheme::CrossAndSelfUpwinding) =
                           Adapt.adapt(to, scheme.divergence_stencil),
                           Adapt.adapt(to, scheme.δu²_stencil),
                           Adapt.adapt(to, scheme.δv²_stencil))
-
-Base.show(io::IO, a::VelocityUpwinding) =
-    print(io, summary(a), " \n",
-            "KE gradient and Divergence flux cross terms reconstruction: ", "\n",
-            "└── $(summary(a.cross_scheme))")
-
-Adapt.adapt_structure(to, scheme::VelocityUpwinding) = 
-    VelocityUpwinding(Adapt.adapt(to, scheme.cross_scheme))
-
diff --git a/src/Advection/vector_invariant_velocity_upwinding.jl b/src/Advection/vector_invariant_velocity_upwinding.jl
deleted file mode 100644
index 167cb735cf..0000000000
--- a/src/Advection/vector_invariant_velocity_upwinding.jl
+++ /dev/null
@@ -1,97 +0,0 @@
-#####
-##### Velocity upwinding is a Partial Upwinding where the upwind choice occurrs _inside_
-##### the difference operator (i.e., velocity upwinding) instead of outside (i.e., derivative upwinding).
-##### _MOST_ stable formulation at the expense of a low kinetic energy
-##### 
-
-##### 
-##### Velocity Upwinding of Divergence flux
-#####
-
-@inline function upwinded_Ax_uᶜᶜᶜ(i, j, k, grid, scheme, u) 
-    û = ℑxᶜᵃᵃ(i, j, k, grid, u)
-
-    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, Ax_qᶠᶜᶜ, u)
-    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, Ax_qᶠᶜᶜ, u)
-
-    return ifelse(û > 0, Uᴸ, Uᴿ)
-end
-
-@inline function upwinded_Ay_vᶜᶜᶜ(i, j, k, grid, scheme, v) 
-    v̂ = ℑyᵃᶜᵃ(i, j, k, grid, v)
-
-    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, Ay_qᶜᶠᶜ, v)
-    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, Ay_qᶜᶠᶜ, v)
-
-    return ifelse(v̂ > 0, Vᴸ, Vᴿ)
-end
-
-@inline reconstructed_Ax_uᶠᶠᶜ(i, j, k, grid, scheme, u) = 
-     _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.upwinding.cross_scheme, Ax_qᶠᶜᶜ, u)
-
-@inline reconstructed_Ay_vᶠᶠᶜ(i, j, k, grid, scheme, v) = 
-     _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.upwinding.cross_scheme, Ay_qᶜᶠᶜ, v)
-
-@inline function upwind_divergence_flux_Uᶠᶜᶜ(i, j, k, grid, scheme::VectorInvariantVelocityVerticalUpwinding, u, v) 
-    @inbounds û = u[i, j, k] 
-    
-    δu = δxᶠᶜᶜ(i, j, k, grid,      upwinded_Ax_uᶜᶜᶜ, scheme, u) 
-    δv = δyᶠᶜᶜ(i, j, k, grid, reconstructed_Ay_vᶠᶠᶜ, scheme, v)
-
-    return û * (δu + δv)
-end
-
-@inline function upwind_divergence_flux_Vᶜᶠᶜ(i, j, k, grid, scheme::VectorInvariantVelocityVerticalUpwinding, u, v) 
-    @inbounds v̂ = v[i, j, k] 
-
-    δu = δxᶜᵃᵃ(i, j, k, grid,      upwinded_Ax_uᶠᶠᶜ, scheme, u) 
-    δv = δyᵃᶠᵃ(i, j, k, grid, reconstructed_Ay_vᶜᶜᶜ, scheme, v)
-
-    return v̂ * (δu + δv)
-end
-
-##### 
-##### Velocity Upwinding of Kinetic Energy gradient
-#####
-
-const VectorInvariantVelocityKEGradientUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:VelocityUpwinding}
-
-@inline function upwinded_u²ᶜᶜᶜ(i, j, k, grid, scheme, u) 
-    û = ℑxᶜᵃᵃ(i, j, k, grid, u)
-
-    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, half_ϕ², u)
-    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, half_ϕ², u)
-
-    return ifelse(û > 0, Uᴸ, Uᴿ)
-end
-
-@inline function upwinded_v²ᶜᶜᶜ(i, j, k, grid, scheme, v) 
-    v̂ = ℑyᵃᶜᵃ(i, j, k, grid, v)
-
-    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, half_ϕ², v)
-    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_gradient_scheme, half_ϕ², v)
-
-    return ifelse(v̂ > 0, Vᴸ, Vᴿ)
-end
-
-@inline reconstructed_u²ᶜᶜᶜ(i, j, k, grid, scheme, u) =
-     _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.upwinding.cross_scheme, half_ϕ², u)
-
-@inline reconstructed_v²ᶜᶜᶜ(i, j, k, grid, scheme, v) = 
-     _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.upwinding.cross_scheme, half_ϕ², v)
-
-@inline function bernoulli_head_U(i, j, k, grid, scheme::VectorInvariantVelocityKEGradientUpwinding, u, v)
-
-    δKu = δxᶠᵃᵃ(i, j, k, grid,      upwinded_u²ᶜᶜᶜ, scheme, u)
-    δKv = δxᶠᵃᵃ(i, j, k, grid, reconstructed_v²ᶜᶜᶜ, scheme, v)
-
-    return (δKu + δKv) / Δxᶠᶜᶜ(i, j, k, grid)
-end
-
-@inline function bernoulli_head_V(i, j, k, grid, scheme::VectorInvariantVelocityKEGradientUpwinding, u, v)
-
-    δKu = δyᵃᶠᵃ(i, j, k, grid,      upwinded_u²ᶜᶜᶜ, scheme, u)
-    δKv = δyᵃᶠᵃ(i, j, k, grid, reconstructed_v²ᶜᶜᶜ, scheme, v)
-
-    return (δKu + δKv) / Δyᶜᶠᶜ(i, j, k, grid)
-end
diff --git a/src/Advection/weno_interpolants.jl b/src/Advection/weno_interpolants.jl
index 7c96b00adb..c135d29b5c 100644
--- a/src/Advection/weno_interpolants.jl
+++ b/src/Advection/weno_interpolants.jl
@@ -1,3 +1,5 @@
+using Oceananigans.Operators: ℑyᵃᶠᵃ, ℑxᶠᵃᵃ
+
 # WENO reconstruction of order `M` entails reconstructions of order `N`
 # on `N` different stencils, where `N = (M + 1) / 2`.
 #
@@ -22,12 +24,12 @@ calculation in the `VectorInvariant` advection formulation.
 
 Smoothness polynomials different from reconstructing polynomials can be specified _only_ for functional reconstructions:
 ```julia
-_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, reconstruced_function::F, smoothness_stencil, args...) where F<:Function
+_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, bias, reconstructed_function::F, smoothness_stencil, args...) where F<:Function
 ```
 
 For scalar reconstructions 
 ```julia
-_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, reconstruced_field::F) where F<:AbstractField
+_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, bias, reconstructed_field::F) where F<:AbstractField
 ```
 the smoothness is _always_ diagnosed from the reconstructing polynomials of `reconstructed_field`
 
@@ -68,32 +70,32 @@ Base.show(io::IO, a::FunctionStencil) =  print(io, "FunctionStencil f = $(a.func
 const ƞ = Int32(2) # WENO exponent
 const ε = 1e-8
 
-# Optimal values taken from
+# Optimal values for finite volume reconstruction of order `WENO{order}` and stencil `Val{stencil}` from
 # Balsara & Shu, "Monotonicity Preserving Weighted Essentially Non-oscillatory Schemes with Inceasingly High Order of Accuracy"
-@inline Cl(::WENO{2}, ::Val{0}) = 2/3
-@inline Cl(::WENO{2}, ::Val{1}) = 1/3
-
-@inline Cl(::WENO{3}, ::Val{0}) = 3/10
-@inline Cl(::WENO{3}, ::Val{1}) = 3/5
-@inline Cl(::WENO{3}, ::Val{2}) = 1/10
-
-@inline Cl(::WENO{4}, ::Val{0}) = 4/35
-@inline Cl(::WENO{4}, ::Val{1}) = 18/35
-@inline Cl(::WENO{4}, ::Val{2}) = 12/35
-@inline Cl(::WENO{4}, ::Val{3}) = 1/35
-
-@inline Cl(::WENO{5}, ::Val{0}) = 5/126
-@inline Cl(::WENO{5}, ::Val{1}) = 20/63
-@inline Cl(::WENO{5}, ::Val{2}) = 10/21
-@inline Cl(::WENO{5}, ::Val{3}) = 10/63
-@inline Cl(::WENO{5}, ::Val{4}) = 1/126
-
-@inline Cl(::WENO{6}, ::Val{0}) = 1/77
-@inline Cl(::WENO{6}, ::Val{1}) = 25/154
-@inline Cl(::WENO{6}, ::Val{2}) = 100/231
-@inline Cl(::WENO{6}, ::Val{3}) = 25/77
-@inline Cl(::WENO{6}, ::Val{4}) = 5/77
-@inline Cl(::WENO{6}, ::Val{5}) = 1/462
+@inline C★(::WENO{2}, ::Val{0}) = 2/3
+@inline C★(::WENO{2}, ::Val{1}) = 1/3
+
+@inline C★(::WENO{3}, ::Val{0}) = 3/10
+@inline C★(::WENO{3}, ::Val{1}) = 3/5
+@inline C★(::WENO{3}, ::Val{2}) = 1/10
+
+@inline C★(::WENO{4}, ::Val{0}) = 4/35
+@inline C★(::WENO{4}, ::Val{1}) = 18/35
+@inline C★(::WENO{4}, ::Val{2}) = 12/35
+@inline C★(::WENO{4}, ::Val{3}) = 1/35
+
+@inline C★(::WENO{5}, ::Val{0}) = 5/126
+@inline C★(::WENO{5}, ::Val{1}) = 20/63
+@inline C★(::WENO{5}, ::Val{2}) = 10/21
+@inline C★(::WENO{5}, ::Val{3}) = 10/63
+@inline C★(::WENO{5}, ::Val{4}) = 1/126
+
+@inline C★(::WENO{6}, ::Val{0}) = 1/77
+@inline C★(::WENO{6}, ::Val{1}) = 25/154
+@inline C★(::WENO{6}, ::Val{2}) = 100/231
+@inline C★(::WENO{6}, ::Val{3}) = 25/77
+@inline C★(::WENO{6}, ::Val{4}) = 5/77
+@inline C★(::WENO{6}, ::Val{5}) = 1/462
 
 # ENO reconstruction procedure per stencil 
 for buffer in [2, 3, 4, 5, 6]
@@ -101,122 +103,192 @@ for buffer in [2, 3, 4, 5, 6]
 
         # ENO coefficients for uniform direction (when T<:Nothing) and stretched directions (when T<:Any) 
         @eval begin
-            @inline Cr(scheme::WENO{$buffer}, ::Val{$stencil}) = @inbounds Cl(scheme, Val($(buffer-stencil-1)))
+            """
+                coeff_p(::WENO{buffer, FT}, bias, ::Val{stencil}, T, args...) 
 
-            # uniform coefficients are independent on direction and location
-            @inline  coeff_left_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds FT.($(stencil_coefficients(50, stencil  , collect(1:100), collect(1:100); order = buffer)))
-            @inline coeff_right_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds FT.($(stencil_coefficients(50, stencil-1, collect(1:100), collect(1:100); order = buffer)))
+            Reconstruction coefficients for the stencil number `stencil` of a WENO reconstruction 
+            of order `buffer * 2 - 1`. Uniform coefficients (i.e. when `T == Nothing`) are independent of the
+            `bias` of the reconstruction (either `LeftBias` or `RightBias`), while stretched coefficients are
+            retrieved from the precomputed coefficients via the `retrieve_coeff` function
+            """
+            @inline coeff_p(::WENO{$buffer, FT}, bias, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = 
+                @inbounds map(FT, $(stencil_coefficients(50, stencil, collect(1:100), collect(1:100); order = buffer)))
 
             # stretched coefficients are retrieved from precalculated coefficients
-            @inline  coeff_left_p(scheme::WENO{$buffer}, ::Val{$stencil}, T, dir, i, loc) = @inbounds retrieve_coeff(scheme, $stencil,     dir, i, loc)
-            @inline coeff_right_p(scheme::WENO{$buffer}, ::Val{$stencil}, T, dir, i, loc) = @inbounds retrieve_coeff(scheme, $(stencil-1), dir, i, loc)
+            @inline coeff_p(scheme::WENO{$buffer}, bias, ::Val{$stencil}, T, dir, i, loc) = 
+                ifelse(bias isa LeftBias, retrieve_coeff(scheme, $stencil, dir, i, loc),
+                                  reverse(retrieve_coeff(scheme, $(buffer - 2 - stencil), dir, i, loc)))
         end
     
         # left biased and right biased reconstruction value for each stencil
         @eval begin
-            @inline  left_biased_p(scheme::WENO{$buffer}, ::Val{$stencil}, ψ, T, dir, i, loc) = @inbounds  sum(coeff_left_p(scheme, Val($stencil), T, dir, i, loc) .* ψ)
-            @inline right_biased_p(scheme::WENO{$buffer}, ::Val{$stencil}, ψ, T, dir, i, loc) = @inbounds sum(coeff_right_p(scheme, Val($stencil), T, dir, i, loc) .* ψ)
+            """ 
+                biased_p(scheme::WENO{buffer}, bias, ::Val{stencil}, ψ, T, dir, i, loc)
+
+            Biased reconstruction of `ψ` from the stencil `stencil` of a WENO reconstruction of
+            order `buffer * 2 - 1`. The reconstruction is calculated as
+            
+            ```math
+            ψ★ = ∑ᵣ cᵣ ⋅ ψᵣ
+            ```
+
+            where ``cᵣ`` is computed from the function `coeff_p`
+            """
+            @inline biased_p(scheme::WENO{$buffer}, bias, ::Val{$stencil}, ψ, T, dir, i, loc) = 
+                @inbounds sum(coeff_p(scheme, bias, Val($stencil), T, dir, i, loc) .* ψ)
         end
     end
 end
 
 # _UNIFORM_ smoothness coefficients (stretched smoothness coefficients are to be fixed!)
-@inline smoothness_coefficients(scheme::WENO{2, FT}, ::Val{0}) where FT = @inbounds FT.((1, -2, 1))
-@inline smoothness_coefficients(scheme::WENO{2, FT}, ::Val{1}) where FT = @inbounds FT.((1, -2, 1))
-
-@inline smoothness_coefficients(scheme::WENO{3, FT}, ::Val{0}) where FT = @inbounds FT.((10, -31, 11, 25, -19,  4))
-@inline smoothness_coefficients(scheme::WENO{3, FT}, ::Val{1}) where FT = @inbounds FT.((4,  -13, 5,  13, -13,  4))
-@inline smoothness_coefficients(scheme::WENO{3, FT}, ::Val{2}) where FT = @inbounds FT.((4,  -19, 11, 25, -31, 10))
-
-@inline smoothness_coefficients(scheme::WENO{4, FT}, ::Val{0}) where FT = @inbounds FT.((2.107,  -9.402, 7.042, -1.854, 11.003,  -17.246,  4.642,  7.043,  -3.882, 0.547))
-@inline smoothness_coefficients(scheme::WENO{4, FT}, ::Val{1}) where FT = @inbounds FT.((0.547,  -2.522, 1.922, -0.494,  3.443,  - 5.966,  1.602,  2.843,  -1.642, 0.267))
-@inline smoothness_coefficients(scheme::WENO{4, FT}, ::Val{2}) where FT = @inbounds FT.((0.267,  -1.642, 1.602, -0.494,  2.843,  - 5.966,  1.922,  3.443,  -2.522, 0.547))
-@inline smoothness_coefficients(scheme::WENO{4, FT}, ::Val{3}) where FT = @inbounds FT.((0.547,  -3.882, 4.642, -1.854,  7.043,  -17.246,  7.042, 11.003,  -9.402, 2.107))
-
-@inline smoothness_coefficients(scheme::WENO{5, FT}, ::Val{0}) where FT = @inbounds FT.((1.07918,  -6.49501, 7.58823, -4.11487,  0.86329,  10.20563, -24.62076, 13.58458, -2.88007, 15.21393, -17.04396, 3.64863,  4.82963, -2.08501, 0.22658)) 
-@inline smoothness_coefficients(scheme::WENO{5, FT}, ::Val{1}) where FT = @inbounds FT.((0.22658,  -1.40251, 1.65153, -0.88297,  0.18079,   2.42723,  -6.11976,  3.37018, -0.70237,  4.06293,  -4.64976, 0.99213,  1.38563, -0.60871, 0.06908)) 
-@inline smoothness_coefficients(scheme::WENO{5, FT}, ::Val{2}) where FT = @inbounds FT.((0.06908,  -0.51001, 0.67923, -0.38947,  0.08209,   1.04963,  -2.99076,  1.79098, -0.38947,  2.31153,  -2.99076, 0.67923,  1.04963, -0.51001, 0.06908)) 
-@inline smoothness_coefficients(scheme::WENO{5, FT}, ::Val{3}) where FT = @inbounds FT.((0.06908,  -0.60871, 0.99213, -0.70237,  0.18079,   1.38563,  -4.64976,  3.37018, -0.88297,  4.06293,  -6.11976, 1.65153,  2.42723, -1.40251, 0.22658)) 
-@inline smoothness_coefficients(scheme::WENO{5, FT}, ::Val{4}) where FT = @inbounds FT.((0.22658,  -2.08501, 3.64863, -2.88007,  0.86329,   4.82963, -17.04396, 13.58458, -4.11487, 15.21393, -24.62076, 7.58823, 10.20563, -6.49501, 1.07918)) 
-
-@inline smoothness_coefficients(scheme::WENO{6, FT}, ::Val{0}) where FT = @inbounds FT.((0.6150211, -4.7460464, 7.6206736, -6.3394124, 2.7060170, -0.4712740,  9.4851237, -31.1771244, 26.2901672, -11.3206788,  1.9834350, 26.0445372, -44.4003904, 19.2596472, -3.3918804, 19.0757572, -16.6461044, 2.9442256, 3.6480687, -1.2950184, 0.1152561)) 
-@inline smoothness_coefficients(scheme::WENO{6, FT}, ::Val{1}) where FT = @inbounds FT.((0.1152561, -0.9117992, 1.4742480, -1.2183636, 0.5134574, -0.0880548,  1.9365967,  -6.5224244,  5.5053752,  -2.3510468,  0.4067018,  5.6662212,  -9.7838784,  4.2405032, -0.7408908,  4.3093692,  -3.7913324, 0.6694608, 0.8449957, -0.3015728, 0.0271779)) 
-@inline smoothness_coefficients(scheme::WENO{6, FT}, ::Val{2}) where FT = @inbounds FT.((0.0271779, -0.2380800, 0.4086352, -0.3462252, 0.1458762, -0.0245620,  0.5653317,  -2.0427884,  1.7905032,  -0.7727988,  0.1325006,  1.9510972,  -3.5817664,  1.5929912, -0.2792660,  1.7195652,  -1.5880404, 0.2863984, 0.3824847, -0.1429976, 0.0139633)) 
-@inline smoothness_coefficients(scheme::WENO{6, FT}, ::Val{3}) where FT = @inbounds FT.((0.0139633, -0.1429976, 0.2863984, -0.2792660, 0.1325006, -0.0245620,  0.3824847,  -1.5880404,  1.5929912,  -0.7727988,  0.1458762,  1.7195652,  -3.5817664,  1.7905032, -0.3462252,  1.9510972,  -2.0427884, 0.4086352, 0.5653317, -0.2380800, 0.0271779)) 
-@inline smoothness_coefficients(scheme::WENO{6, FT}, ::Val{4}) where FT = @inbounds FT.((0.0271779, -0.3015728, 0.6694608, -0.7408908, 0.4067018, -0.0880548,  0.8449957,  -3.7913324,  4.2405032,  -2.3510468,  0.5134574,  4.3093692,  -9.7838784,  5.5053752, -1.2183636,  5.6662212,  -6.5224244, 1.4742480, 1.9365967, -0.9117992, 0.1152561)) 
-@inline smoothness_coefficients(scheme::WENO{6, FT}, ::Val{5}) where FT = @inbounds FT.((0.1152561, -1.2950184, 2.9442256, -3.3918804, 1.9834350, -0.4712740,  3.6480687, -16.6461044, 19.2596472, -11.3206788,  2.7060170, 19.0757572, -44.4003904, 26.2901672, -6.3394124, 26.0445372, -31.1771244, 7.6206736, 9.4851237, -4.7460464, 0.6150211)) 
+
+"""
+    smoothness_coefficients(::Val{buffer}, ::Val{stencil})
+
+Return the coefficients used to calculate the smoothness indicators for the stencil 
+number `stencil` of a WENO reconstruction of order `buffer * 2 - 1`. The coefficients
+are ordered in such a way to calculate the smoothness in the following fashion:
+
+```julia
+buffer  = 4
+stencil = 0
+
+ψ = # The stencil corresponding to S₀ with buffer 4 (7th order WENO)
+
+C = smoothness_coefficients(Val(buffer), Val(0))
+
+# The smoothness indicator
+β = ψ[1] * (C[1]  * ψ[1] + C[2] * ψ[2] + C[3] * ψ[3] + C[4] * ψ[4]) + 
+    ψ[2] * (C[5]  * ψ[2] + C[6] * ψ[3] + C[7] * ψ[4]) + 
+    ψ[3] * (C[8]  * ψ[3] + C[9] * ψ[4]) +
+    ψ[4] * (C[10] * ψ[4])
+```
+
+This last operation is metaprogrammed in the function `metaprogrammed_smoothness_operation`
+"""
+@inline smoothness_coefficients(::Val{2}, ::Val{0}) = :((1, -2, 1))
+@inline smoothness_coefficients(::Val{2}, ::Val{1}) = :((1, -2, 1))
+
+@inline smoothness_coefficients(::Val{3}, ::Val{0}) = :((10, -31, 11, 25, -19,  4))
+@inline smoothness_coefficients(::Val{3}, ::Val{1}) = :((4,  -13, 5,  13, -13,  4))
+@inline smoothness_coefficients(::Val{3}, ::Val{2}) = :((4,  -19, 11, 25, -31, 10))
+
+@inline smoothness_coefficients(::Val{4}, ::Val{0}) = :((2.107,  -9.402, 7.042, -1.854, 11.003,  -17.246,  4.642,  7.043,  -3.882, 0.547))
+@inline smoothness_coefficients(::Val{4}, ::Val{1}) = :((0.547,  -2.522, 1.922, -0.494,  3.443,  - 5.966,  1.602,  2.843,  -1.642, 0.267))
+@inline smoothness_coefficients(::Val{4}, ::Val{2}) = :((0.267,  -1.642, 1.602, -0.494,  2.843,  - 5.966,  1.922,  3.443,  -2.522, 0.547))
+@inline smoothness_coefficients(::Val{4}, ::Val{3}) = :((0.547,  -3.882, 4.642, -1.854,  7.043,  -17.246,  7.042, 11.003,  -9.402, 2.107))
+
+@inline smoothness_coefficients(::Val{5}, ::Val{0}) = :((1.07918,  -6.49501, 7.58823, -4.11487,  0.86329,  10.20563, -24.62076, 13.58458, -2.88007, 15.21393, -17.04396, 3.64863,  4.82963, -2.08501, 0.22658))
+@inline smoothness_coefficients(::Val{5}, ::Val{1}) = :((0.22658,  -1.40251, 1.65153, -0.88297,  0.18079,   2.42723,  -6.11976,  3.37018, -0.70237,  4.06293,  -4.64976, 0.99213,  1.38563, -0.60871, 0.06908))
+@inline smoothness_coefficients(::Val{5}, ::Val{2}) = :((0.06908,  -0.51001, 0.67923, -0.38947,  0.08209,   1.04963,  -2.99076,  1.79098, -0.38947,  2.31153,  -2.99076, 0.67923,  1.04963, -0.51001, 0.06908))
+@inline smoothness_coefficients(::Val{5}, ::Val{3}) = :((0.06908,  -0.60871, 0.99213, -0.70237,  0.18079,   1.38563,  -4.64976,  3.37018, -0.88297,  4.06293,  -6.11976, 1.65153,  2.42723, -1.40251, 0.22658))
+@inline smoothness_coefficients(::Val{5}, ::Val{4}) = :((0.22658,  -2.08501, 3.64863, -2.88007,  0.86329,   4.82963, -17.04396, 13.58458, -4.11487, 15.21393, -24.62076, 7.58823, 10.20563, -6.49501, 1.07918))
+
+@inline smoothness_coefficients(::Val{6}, ::Val{0}) = :((0.6150211, -4.7460464, 7.6206736, -6.3394124, 2.7060170, -0.4712740,  9.4851237, -31.1771244, 26.2901672, -11.3206788,  1.9834350, 26.0445372, -44.4003904, 19.2596472, -3.3918804, 19.0757572, -16.6461044, 2.9442256, 3.6480687, -1.2950184, 0.1152561))
+@inline smoothness_coefficients(::Val{6}, ::Val{1}) = :((0.1152561, -0.9117992, 1.4742480, -1.2183636, 0.5134574, -0.0880548,  1.9365967,  -6.5224244,  5.5053752,  -2.3510468,  0.4067018,  5.6662212,  -9.7838784,  4.2405032, -0.7408908,  4.3093692,  -3.7913324, 0.6694608, 0.8449957, -0.3015728, 0.0271779))
+@inline smoothness_coefficients(::Val{6}, ::Val{2}) = :((0.0271779, -0.2380800, 0.4086352, -0.3462252, 0.1458762, -0.0245620,  0.5653317,  -2.0427884,  1.7905032,  -0.7727988,  0.1325006,  1.9510972,  -3.5817664,  1.5929912, -0.2792660,  1.7195652,  -1.5880404, 0.2863984, 0.3824847, -0.1429976, 0.0139633))
+@inline smoothness_coefficients(::Val{6}, ::Val{3}) = :((0.0139633, -0.1429976, 0.2863984, -0.2792660, 0.1325006, -0.0245620,  0.3824847,  -1.5880404,  1.5929912,  -0.7727988,  0.1458762,  1.7195652,  -3.5817664,  1.7905032, -0.3462252,  1.9510972,  -2.0427884, 0.4086352, 0.5653317, -0.2380800, 0.0271779))
+@inline smoothness_coefficients(::Val{6}, ::Val{4}) = :((0.0271779, -0.3015728, 0.6694608, -0.7408908, 0.4067018, -0.0880548,  0.8449957,  -3.7913324,  4.2405032,  -2.3510468,  0.5134574,  4.3093692,  -9.7838784,  5.5053752, -1.2183636,  5.6662212,  -6.5224244, 1.4742480, 1.9365967, -0.9117992, 0.1152561))
+@inline smoothness_coefficients(::Val{6}, ::Val{5}) = :((0.1152561, -1.2950184, 2.9442256, -3.3918804, 1.9834350, -0.4712740,  3.6480687, -16.6461044, 19.2596472, -11.3206788,  2.7060170, 19.0757572, -44.4003904, 26.2901672, -6.3394124, 26.0445372, -31.1771244, 7.6206736, 9.4851237, -4.7460464, 0.6150211))
 
 # The rule for calculating smoothness indicators is the following (example WENO{4} which is seventh order) 
 # ψ[1] (C[1]  * ψ[1] + C[2] * ψ[2] + C[3] * ψ[3] + C[4] * ψ[4]) + 
 # ψ[2] (C[5]  * ψ[2] + C[6] * ψ[3] + C[7] * ψ[4]) + 
 # ψ[3] (C[8]  * ψ[3] + C[9] * ψ[4])
 # ψ[4] (C[10] * ψ[4])
-# This expression is the output of metaprogrammed_smoothness_sum(4)
+# This expression is the output of metaprogrammed_smoothness_operation(4)
 
 # Trick to force compilation of Val(stencil-1) and avoid loops on the GPU
-@inline function metaprogrammed_smoothness_sum(buffer)
+@inline function metaprogrammed_smoothness_operation(buffer)
     elem = Vector(undef, buffer)
     c_idx = 1
     for stencil = 1:buffer - 1
-        stencil_sum   = Expr(:call, :+, (:(@inbounds C[$(c_idx + i - stencil)] * ψ[$i]) for i in stencil:buffer)...)
-        elem[stencil] = :(@inbounds ψ[$stencil] * $stencil_sum)
+        stencil_sum   = Expr(:call, :+, (:(C[$(c_idx + i - stencil)] * ψ[$i]) for i in stencil:buffer)...)
+        elem[stencil] = :(ψ[$stencil] * $stencil_sum)
         c_idx += buffer - stencil + 1
     end
 
-    elem[buffer] = :(@inbounds ψ[$buffer] * ψ[$buffer] * C[$c_idx])
+    elem[buffer] = :(ψ[$buffer] * ψ[$buffer] * C[$c_idx])
     
     return Expr(:call, :+, elem...)
 end
 
+"""
+    smoothness_indicator(ψ, scheme::WENO{buffer, FT}, ::Val{stencil})
+
+Return the smoothness indicator β for the stencil number `stencil` of a WENO reconstruction of order `buffer * 2 - 1`.
+The smoothness indicator (β) is calculated as follows
+
+```julia
+C = smoothness_coefficients(Val(buffer), Val(stencil))
+
+# The smoothness indicator
+β = 0
+c_idx = 1
+for stencil = 1:buffer - 1
+    partial_sum = sum(C[c_idx + i - stencil] * ψ[i] for i in stencil:buffer)
+    β          += ψ[stencil] * partial_sum
+    c_idx += buffer - stencil + 1
+end
+
+β += ψ[buffer] * ψ[buffer] * C[c_idx]
+```
+
+This last operation is metaprogrammed in the function `metaprogrammed_smoothness_operation` (to avoid loops)
+and, for `buffer == 3` unrolls into
+
+```julia
+β = ψ[1] * (C[1]  * ψ[1] + C[2] * ψ[2] + C[3] * ψ[3]) + 
+    ψ[2] * (C[4]  * ψ[2] + C[5] * ψ[3]) + 
+    ψ[3] * (C[6]  * ψ[3])
+```
+
+while for `buffer == 4` unrolls into
+
+```julia
+β = ψ[1] * (C[1]  * ψ[1] + C[2] * ψ[2] + C[3] * ψ[3] + C[4] * ψ[4]) + 
+    ψ[2] * (C[5]  * ψ[2] + C[6] * ψ[3] + C[7] * ψ[4]) + 
+    ψ[3] * (C[8]  * ψ[3] + C[9] * ψ[4]) +
+    ψ[4] * (C[10] * ψ[4])
+```
+"""
+@inline smoothness_indicator(ψ, args...) = zero(ψ[1]) # This is a fallback method, here only for documentation purposes
+
 # Smoothness indicators for stencil `stencil` for left and right biased reconstruction
 for buffer in [2, 3, 4, 5, 6]
-    @eval begin
-        @inline smoothness_sum(scheme::WENO{$buffer}, ψ, C) = @inbounds $(metaprogrammed_smoothness_sum(buffer))
-    end
-
-    for stencil in [0, 1, 2, 3, 4, 5]
-        @eval begin
-            @inline  left_biased_β(ψ, scheme::WENO{$buffer}, ::Val{$stencil}) = @inbounds smoothness_sum(scheme, ψ, smoothness_coefficients(scheme, Val($stencil)))
-            @inline right_biased_β(ψ, scheme::WENO{$buffer}, ::Val{$stencil}) = @inbounds smoothness_sum(scheme, ψ, smoothness_coefficients(scheme, Val($stencil)))
-        end
+    @eval @inline smoothness_operation(scheme::WENO{$buffer}, ψ, C) = @inbounds $(metaprogrammed_smoothness_operation(buffer))
+    
+    for stencil in 0:buffer-1
+        @eval @inline smoothness_indicator(ψ, scheme::WENO{$buffer, FT}, ::Val{$stencil}) where FT = 
+                      smoothness_operation(scheme, ψ, map(FT, $(smoothness_coefficients(Val(buffer), Val(stencil)))))
     end
 end
 
-# Shenanigans for WENO weights calculation for vector invariant formulation -> [β[i] = 0.5*(βᵤ[i] + βᵥ[i]) for i in 1:buffer]
+# Shenanigans for WENO weights calculation for vector invariant formulation -> [β[i] = 0.5 * (βᵤ[i] + βᵥ[i]) for i in 1:buffer]
 @inline function metaprogrammed_beta_sum(buffer)
     elem = Vector(undef, buffer)
     for stencil = 1:buffer
-        elem[stencil] = :(@inbounds (β₁[$stencil] + β₂[$stencil])/2)
+        elem[stencil] = :((β₁[$stencil] + β₂[$stencil])/2)
     end
 
     return :($(elem...),)
 end
 
-# left and right biased_β calculation for scheme and stencil = 0:buffer - 1
+# smoothness_indicator calculation for scheme and stencil = 0:buffer - 1
 @inline function metaprogrammed_beta_loop(buffer)
     elem = Vector(undef, buffer)
     for stencil = 1:buffer
-        elem[stencil] = :(@inbounds func(ψ[$stencil], scheme, Val($(stencil-1))))
+        elem[stencil] = :(smoothness_indicator(ψ[$stencil], scheme, Val($(stencil-1))))
     end
 
     return :($(elem...),)
 end
 
-# ZWENO α weights dᵣ * (1 + (τ₂ᵣ₋₁ / (βᵣ + ε))ᵖ)
+# ZWENO α weights C★ᵣ * (1 + (τ₂ᵣ₋₁ / (βᵣ + ε))ᵖ)
 @inline function metaprogrammed_zweno_alpha_loop(buffer)
     elem = Vector(undef, buffer)
     for stencil = 1:buffer
-        elem[stencil] = :(@inbounds FT(coeff(scheme, Val($(stencil-1)))) * (1 + (τ / (β[$stencil] + FT(ε)))^ƞ))
-    end
-
-    return :($(elem...),)
-end
-
-# JSWENO α weights dᵣ / (βᵣ + ε)²
-@inline function metaprogrammed_js_alpha_loop(buffer)
-    elem = Vector(undef, buffer)
-    for stencil = 1:buffer
-        elem[stencil] = :(@inbounds FT(coeff(scheme, Val($(stencil-1)))) / (β[$stencil] + FT(ε))^ƞ)
+        elem[stencil] = :(convert(FT, C★(scheme, Val($(stencil-1)))) * (1 + (τ / (β[$stencil] + FT(ε)))^ƞ))
     end
 
     return :($(elem...),)
@@ -224,70 +296,63 @@ end
 
 for buffer in [2, 3, 4, 5, 6]
     @eval begin
-        @inline         beta_sum(scheme::WENO{$buffer}, β₁, β₂)           = @inbounds $(metaprogrammed_beta_sum(buffer))
-        @inline        beta_loop(scheme::WENO{$buffer}, ψ, func)          = @inbounds $(metaprogrammed_beta_loop(buffer))
-        @inline zweno_alpha_loop(scheme::WENO{$buffer}, β, τ, coeff, FT)  = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer))
-        @inline    js_alpha_loop(scheme::WENO{$buffer}, β, coeff, FT)     = @inbounds $(metaprogrammed_js_alpha_loop(buffer))
+        @inline         beta_sum(scheme::WENO{$buffer, FT}, β₁, β₂) where FT = @inbounds $(metaprogrammed_beta_sum(buffer))
+        @inline        beta_loop(scheme::WENO{$buffer, FT}, ψ)      where FT = @inbounds $(metaprogrammed_beta_loop(buffer))
+        @inline zweno_alpha_loop(scheme::WENO{$buffer, FT}, β, τ)   where FT = @inbounds $(metaprogrammed_zweno_alpha_loop(buffer))
     end
 end
 
-# Global smoothness indicator τ₂ᵣ₋₁ taken from "Accuracy of the weighted essentially non-oscillatory conservative finite difference schemes", Don & Borges, 2013
+# Global smoothness indicator τ₂ᵣ₋₁ from "Accuracy of the weighted essentially non-oscillatory conservative finite difference schemes", Don & Borges, 2013
 @inline global_smoothness_indicator(::Val{2}, β) = @inbounds abs(β[1] - β[2])
 @inline global_smoothness_indicator(::Val{3}, β) = @inbounds abs(β[1] - β[3])
 @inline global_smoothness_indicator(::Val{4}, β) = @inbounds abs(β[1] +  3β[2] -   3β[3] -    β[4])
 @inline global_smoothness_indicator(::Val{5}, β) = @inbounds abs(β[1] +  2β[2] -   6β[3] +   2β[4] + β[5])
 @inline global_smoothness_indicator(::Val{6}, β) = @inbounds abs(β[1] + 36β[2] + 135β[3] - 135β[4] - 36β[5] - β[6])
 
-# Calculating Dynamic WENO Weights (wᵣ), either with JS weno, Z weno or VectorInvariant WENO
-for (side, coeff) in zip([:left, :right], (:Cl, :Cr))
-    biased_weno_weights = Symbol(side, :_biased_weno_weights)
-    biased_β = Symbol(side, :_biased_β)
-    
-    tangential_stencil_u = Symbol(:tangential_, side, :_stencil_u)
-    tangential_stencil_v = Symbol(:tangential_, side, :_stencil_v)
+"""
+    biased_weno_weights(ψ, scheme::WENO{N, FT}, args...)
 
-    biased_stencil_z = Symbol(side, :_stencil_z)
-    
-    @eval begin
-        @inline function $biased_weno_weights(ψ, scheme::WENO{N, FT}, args...) where {N, FT}
-            @inbounds begin
-                β = beta_loop(scheme, ψ, $biased_β)
-                    
-                if scheme isa ZWENO
-                    τ = global_smoothness_indicator(Val(N), β)
-                    α = zweno_alpha_loop(scheme, β, τ, $coeff, FT)
-                else
-                    α = js_alpha_loop(scheme, β, $coeff, FT)
-                end
-                return α ./ sum(α)
-            end
-        end
+Biased weno weights ω used to weight the WENO reconstruction of the different stencils. 
+We use here a Z-WENO formulation where
 
-        @inline function $biased_weno_weights(ijk, scheme::WENO{N, FT}, dir, ::VelocityStencil, u, v) where {N, FT}
-            @inbounds begin
-                i, j, k = ijk
-            
-                uₛ = $tangential_stencil_u(i, j, k, scheme, dir, u)
-                vₛ = $tangential_stencil_v(i, j, k, scheme, dir, v)
-                βᵤ = beta_loop(scheme, uₛ, $biased_β)
-                βᵥ = beta_loop(scheme, vₛ, $biased_β)
-
-                β  = beta_sum(scheme, βᵤ, βᵥ)
-
-                if scheme isa ZWENO
-                    τ = global_smoothness_indicator(Val(N), β)
-                    α = zweno_alpha_loop(scheme, β, τ, $coeff, FT)
-                else
-                    α = js_alpha_loop(scheme, β, $coeff, FT)
-                end
-                return α ./ sum(α)
-            end
-        end
-    end
+```math
+    α = C★ ⋅ (1 + τ² / (β + ϵ)²) 
+```
+
+where 
+- ``C★`` is the optimal weight that leads to an upwind reconstruction of order `N * 2 - 1`,
+- ``β`` is the smoothness indicator calculated by the `smoothness_indicator` function
+- ``τ`` is a global smoothness indicator, function of the ``β`` values, calculated by the `global_smoothness_indicator` function
+- ``ϵ`` is a regularization constant, typically equal to 1e-8
+
+The ``α`` values are normalized before returning
+"""
+@inline function biased_weno_weights(ψ, scheme::WENO{N, FT}, args...) where {N, FT}
+    β = beta_loop(scheme, ψ)
+                
+    τ = global_smoothness_indicator(Val(N), β)
+    α = zweno_alpha_loop(scheme, β, τ)
+
+    return α ./ sum(α)
+end
+
+@inline function biased_weno_weights(ijk, scheme::WENO{N, FT}, bias, dir, ::VelocityStencil, u, v) where {N, FT}
+    i, j, k = ijk
+    
+    uₛ = tangential_stencil_u(i, j, k, scheme, bias, dir, u)
+    vₛ = tangential_stencil_v(i, j, k, scheme, bias, dir, v)
+    βᵤ = beta_loop(scheme, uₛ)
+    βᵥ = beta_loop(scheme, vₛ)
+    β  =  beta_sum(scheme, βᵤ, βᵥ)
+
+    τ = global_smoothness_indicator(Val(N), β)
+    α = zweno_alpha_loop(scheme, β, τ)
+    
+    return α ./ sum(α)
 end
 
 """ 
-    calc_weno_stencil(buffer, shift, dir, func::Bool = false)
+    load_weno_stencil(buffer, dir, func::Bool = false)
 
 Stencils for WENO reconstruction calculations
 
@@ -298,145 +363,183 @@ Examples
 ========
 
 ```jldoctest
-julia> using Oceananigans.Advection: calc_weno_stencil
+julia> using Oceananigans.Advection: load_weno_stencil
 
-julia> calc_weno_stencil(3, :left, :x)
-:(((ψ[i + -1, j, k], ψ[i + 0, j, k], ψ[i + 1, j, k]), (ψ[i + -2, j, k], ψ[i + -1, j, k], ψ[i + 0, j, k]), (ψ[i + -3, j, k], ψ[i + -2, j, k], ψ[i + -1, j, k])))
+julia> load_weno_stencil(3, :x)
+:((ψ[i + -3, j, k], ψ[i + -2, j, k], ψ[i + -1, j, k], ψ[i + 0, j, k], ψ[i + 1, j, k], ψ[i + 2, j, k]))
 
-julia> calc_weno_stencil(2, :right, :x)
-:(((ψ[i + 0, j, k], ψ[i + 1, j, k]), (ψ[i + -1, j, k], ψ[i + 0, j, k])))
+julia> load_weno_stencil(2, :x)
+:((ψ[i + -2, j, k], ψ[i + -1, j, k], ψ[i + 0, j, k], ψ[i + 1, j, k]))
 
 """
-@inline function calc_weno_stencil(buffer, shift, dir, func::Bool = false) 
-    N = buffer * 2
-    if shift != :none
-        N -=1
-    end
-    stencil_full = Vector(undef, buffer)
-    rng = 1:N
-    if shift == :right
-        rng = rng .+ 1
-    end
-    for stencil in 1:buffer
-        stencil_point = Vector(undef, buffer)
-        rngstencil = rng[stencil:stencil+buffer-1]
-        for (idx, n) in enumerate(rngstencil)
-            c = n - buffer - 1
-            if func 
-                stencil_point[idx] =  dir == :x ? 
-                                      :(@inbounds ψ(i + $c, j, k, args...)) :
-                                      dir == :y ?
-                                      :(@inbounds ψ(i, j + $c, k, args...)) :
-                                      :(@inbounds ψ(i, j, k + $c, args...))
-            else    
-                stencil_point[idx] =  dir == :x ? 
-                                      :(@inbounds ψ[i + $c, j, k]) :
-                                      dir == :y ?
-                                      :(@inbounds ψ[i, j + $c, k]) :
-                                      :(@inbounds ψ[i, j, k + $c])
-            end                
-        end
-        stencil_full[buffer - stencil + 1] = :($(stencil_point...), )
+@inline function load_weno_stencil(buffer, dir, func::Bool = false) 
+    N = buffer * 2 - 1
+    stencil = Vector(undef, N+1)
+
+    for (idx, c) in enumerate(-buffer:buffer-1)
+        if func 
+            stencil[idx] =  dir == :x ? 
+                            :(ψ(i + $c, j, k, args...)) :
+                            dir == :y ?
+                            :(ψ(i, j + $c, k, args...)) :
+                            :(ψ(i, j, k + $c, args...))
+        else    
+            stencil[idx] =  dir == :x ? 
+                            :(ψ[i + $c, j, k]) :
+                            dir == :y ?
+                            :(ψ[i, j + $c, k]) :
+                            :(ψ[i, j, k + $c])
+        end             
     end
-    return :($(stencil_full...),)
+
+    return :($(stencil...),)
 end
 
 # Stencils for left and right biased reconstruction ((ψ̅ᵢ₋ᵣ₊ⱼ for j in 0:k) for r in 0:k) to calculate v̂ᵣ = ∑ⱼ(cᵣⱼψ̅ᵢ₋ᵣ₊ⱼ) 
 # where `k = N - 1`. Coefficients (cᵣⱼ for j in 0:N) for stencil r are given by `coeff_side_p(scheme, Val(r), ...)`
-for side in (:left, :right), dir in (:x, :y, :z)
-    stencil = Symbol(side, :_stencil_, dir)
+for dir in (:x, :y, :z), (T, f) in zip((:Any, :Function), (false, true))
+    stencil = Symbol(:weno_stencil_, dir)
+    @eval begin
+        @inline function $stencil(i, j, k, ::WENO{2}, bias, ψ::$T, args...) 
+            S = @inbounds $(load_weno_stencil(2, dir, f))
+            return S₀₂(S, bias), S₁₂(S, bias)
+        end
 
-    for buffer in [2, 3, 4, 5, 6]
-        @eval begin
-            @inline $stencil(i, j, k, scheme::WENO{$buffer}, ψ, args...)           = @inbounds $(calc_weno_stencil(buffer, side, dir, false))
-            @inline $stencil(i, j, k, scheme::WENO{$buffer}, ψ::Function, args...) = @inbounds $(calc_weno_stencil(buffer, side, dir,  true))
+        @inline function $stencil(i, j, k, ::WENO{3}, bias, ψ::$T, args...) 
+            S = @inbounds $(load_weno_stencil(3, dir, f))
+            return S₀₃(S, bias), S₁₃(S, bias), S₂₃(S, bias)
+        end
+
+        @inline function $stencil(i, j, k, ::WENO{4}, bias, ψ::$T, args...) 
+            S = @inbounds $(load_weno_stencil(4, dir, f))
+            return S₀₄(S, bias), S₁₄(S, bias), S₂₄(S, bias), S₃₄(S, bias)
+        end
+
+        @inline function $stencil(i, j, k, ::WENO{5}, bias, ψ::$T, args...) 
+            S = @inbounds $(load_weno_stencil(5, dir, f))
+            return S₀₅(S, bias), S₁₅(S, bias), S₂₅(S, bias), S₃₅(S, bias), S₄₅(S, bias)
+        end
+
+        @inline function $stencil(i, j, k, ::WENO{6}, bias, ψ::$T, args...) 
+            S = @inbounds $(load_weno_stencil(6, dir, f))
+            return S₀₆(S, bias), S₁₆(S, bias), S₂₆(S, bias), S₃₆(S, bias), S₄₆(S, bias), S₅₆(S, bias)
         end
     end
 end
 
-# Stencil for vector invariant calculation of smoothness indicators in the horizontal direction
-# Parallel to the interpolation direction! (same as left/right stencil)
-using Oceananigans.Operators: ℑyᵃᶠᵃ, ℑxᶠᵃᵃ
+# WENO stencils
+@inline S₀₂(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[2], S[3]), (S[3], S[2]))
+@inline S₁₂(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[1], S[2]), (S[4], S[3]))
+
+@inline S₀₃(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[3], S[4], S[5]), (S[4], S[3], S[2]))
+@inline S₁₃(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[2], S[3], S[4]), (S[5], S[4], S[3]))
+@inline S₂₃(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[1], S[2], S[3]), (S[6], S[5], S[4]))
+
+@inline S₀₄(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[4], S[5], S[6], S[7]), (S[5], S[4], S[3], S[2]))
+@inline S₁₄(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[3], S[4], S[5], S[6]), (S[6], S[5], S[4], S[3]))
+@inline S₂₄(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[2], S[3], S[4], S[5]), (S[7], S[6], S[5], S[4]))
+@inline S₃₄(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[1], S[2], S[3], S[4]), (S[8], S[7], S[6], S[5]))
+
+@inline S₀₅(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[5], S[6], S[7], S[8], S[9]), (S[6],  S[5], S[4], S[3], S[2]))
+@inline S₁₅(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[4], S[5], S[6], S[7], S[8]), (S[7],  S[6], S[5], S[4], S[3]))
+@inline S₂₅(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[3], S[4], S[5], S[6], S[7]), (S[8],  S[7], S[6], S[5], S[4]))
+@inline S₃₅(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[2], S[3], S[4], S[5], S[6]), (S[9],  S[8], S[7], S[6], S[5]))
+@inline S₄₅(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[1], S[2], S[3], S[4], S[5]), (S[10], S[9], S[8], S[7], S[6]))
 
-@inline tangential_left_stencil_u(i, j, k, scheme, ::Val{1}, u) = @inbounds left_stencil_x(i, j, k, scheme, ℑyᵃᶠᵃ, u)
-@inline tangential_left_stencil_u(i, j, k, scheme, ::Val{2}, u) = @inbounds left_stencil_y(i, j, k, scheme, ℑyᵃᶠᵃ, u)
-@inline tangential_left_stencil_v(i, j, k, scheme, ::Val{1}, v) = @inbounds left_stencil_x(i, j, k, scheme, ℑxᶠᵃᵃ, v)
-@inline tangential_left_stencil_v(i, j, k, scheme, ::Val{2}, v) = @inbounds left_stencil_y(i, j, k, scheme, ℑxᶠᵃᵃ, v)
+@inline S₀₆(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[6], S[7], S[8], S[9], S[10], S[11]), (S[7],  S[6],  S[5],  S[4], S[3], S[2]))
+@inline S₁₆(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[5], S[6], S[7], S[8], S[9],  S[10]), (S[8],  S[7],  S[6],  S[5], S[4], S[3]))
+@inline S₂₆(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[4], S[5], S[6], S[7], S[8],  S[9]),  (S[9],  S[8],  S[7],  S[6], S[5], S[4]))
+@inline S₃₆(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[3], S[4], S[5], S[6], S[7],  S[8]),  (S[10], S[9],  S[8],  S[7], S[6], S[5]))
+@inline S₄₆(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[2], S[3], S[4], S[5], S[6],  S[7]),  (S[11], S[10], S[9],  S[8], S[7], S[6]))
+@inline S₅₆(S, bias) = @inbounds ifelse(bias isa LeftBias, (S[1], S[2], S[3], S[4], S[5],  S[6]),  (S[12], S[11], S[10], S[9], S[8], S[7]))
 
-@inline tangential_right_stencil_u(i, j, k, scheme, ::Val{1}, u) = @inbounds right_stencil_x(i, j, k, scheme, ℑyᵃᶠᵃ, u)
-@inline tangential_right_stencil_u(i, j, k, scheme, ::Val{2}, u) = @inbounds right_stencil_y(i, j, k, scheme, ℑyᵃᶠᵃ, u)
-@inline tangential_right_stencil_v(i, j, k, scheme, ::Val{1}, v) = @inbounds right_stencil_x(i, j, k, scheme, ℑxᶠᵃᵃ, v)
-@inline tangential_right_stencil_v(i, j, k, scheme, ::Val{2}, v) = @inbounds right_stencil_y(i, j, k, scheme, ℑxᶠᵃᵃ, v)
+# Stencil for vector invariant calculation of smoothness indicators in the horizontal direction
+# Parallel to the interpolation direction! (same as left/right stencil)
+@inline tangential_stencil_u(i, j, k, scheme, bias, ::Val{1}, u) = @inbounds weno_stencil_x(i, j, k, scheme, bias, ℑyᵃᶠᵃ, u)
+@inline tangential_stencil_u(i, j, k, scheme, bias, ::Val{2}, u) = @inbounds weno_stencil_y(i, j, k, scheme, bias, ℑyᵃᶠᵃ, u)
+@inline tangential_stencil_v(i, j, k, scheme, bias, ::Val{1}, v) = @inbounds weno_stencil_x(i, j, k, scheme, bias, ℑxᶠᵃᵃ, v)
+@inline tangential_stencil_v(i, j, k, scheme, bias, ::Val{2}, v) = @inbounds weno_stencil_y(i, j, k, scheme, bias, ℑxᶠᵃᵃ, v)
 
 # Trick to force compilation of Val(stencil-1) and avoid loops on the GPU
-@inline function metaprogrammed_stencil_sum(buffer)
+@inline function metaprogrammed_weno_reconstruction(buffer)
     elem = Vector(undef, buffer)
     for stencil = 1:buffer
-        elem[stencil] = :(@inbounds w[$stencil] * func(scheme, Val($(stencil-1)), ψ[$stencil], cT, Val(val), idx, loc))
+        elem[stencil] = :(ω[$stencil] * biased_p(scheme, bias, Val($(stencil-1)), ψ[$stencil], cT, Val(val), idx, loc))
     end
 
     return Expr(:call, :+, elem...)
 end
 
+"""
+    weno_reconstruction(scheme::WENO{buffer}, bias, ψ, ω, cT, val, idx, loc)
+
+`bias`ed reconstruction of stencils `ψ` for a WENO scheme of order `buffer * 2 - 1` weighted by WENO
+weights `ω`. `ψ` is a `Tuple` of `buffer` stencils of size `buffer` and `ω` is a `Tuple` of size `buffer`
+containing the computed weights for each of the reconstruction stencils. 
+
+The additional inputs are only used for stretched WENO directions that require the knowledge of the location `loc`
+and the index `idx`.
+
+The calculation of the reconstruction is metaprogrammed in the `metaprogrammed_weno_reconstruction` function which, for
+`buffer == 4` (seventh order WENO), unrolls to:
+
+```julia
+ψ̂ = ω[1] * biased_p(scheme, bias, Val(0), ψ[1], cT, Val(val), idx, loc) + 
+    ω[2] * biased_p(scheme, bias, Val(1), ψ[2], cT, Val(val), idx, loc) + 
+    ω[3] * biased_p(scheme, bias, Val(2), ψ[3], cT, Val(val), idx, loc) + 
+    ω[4] * biased_p(scheme, bias, Val(3), ψ[4], cT, Val(val), idx, loc)
+```
+
+Here, [`biased_p`](@ref) is the function that computes the linear reconstruction of the individual stencils.
+"""
+@inline weno_reconstruction(scheme, bias, ψ, args...) = zero(ψ[1][1]) # Fallback only for documentation purposes
+
 # Calculation of WENO reconstructed value v⋆ = ∑ᵣ(wᵣv̂ᵣ)
 for buffer in [2, 3, 4, 5, 6]
-    @eval begin
-        @inline stencil_sum(scheme::WENO{$buffer}, ψ, w, func, cT, val, idx, loc) = @inbounds $(metaprogrammed_stencil_sum(buffer))
-    end
+    @eval @inline weno_reconstruction(scheme::WENO{$buffer}, bias, ψ, ω, cT, val, idx, loc) = @inbounds $(metaprogrammed_weno_reconstruction(buffer))
 end
 
 # Interpolation functions
 for (interp, dir, val, cT) in zip([:xᶠᵃᵃ, :yᵃᶠᵃ, :zᵃᵃᶠ], [:x, :y, :z], [1, 2, 3], [:XT, :YT, :ZT]) 
-    for side in (:left, :right)
-        interpolate_func = Symbol(:inner_, side, :_biased_interpolate_, interp)
-        stencil          = Symbol(side, :_stencil_, dir)
-        weno_weights     = Symbol(side, :_biased_weno_weights)
-        biased_p         = Symbol(side, :_biased_p)
-        
-        @eval begin
-            @inline function $interpolate_func(i, j, k, grid, 
-                                               scheme::WENO{N, FT, XT, YT, ZT}, 
-                                               ψ, idx, loc, args...) where {N, FT, XT, YT, ZT}
-                @inbounds begin
-                    ψₜ = $stencil(i, j, k, scheme, ψ, grid, args...)
-                    w = $weno_weights(ψₜ, scheme, Val($val), Nothing, args...)
-                    return stencil_sum(scheme, ψₜ, w, $biased_p, $cT, $val, idx, loc)
-                end
-            end
-
-            @inline function $interpolate_func(i, j, k, grid, 
-                                               scheme::WENO{N, FT, XT, YT, ZT}, 
-                                               ψ, idx, loc, VI::AbstractSmoothnessStencil, args...) where {N, FT, XT, YT, ZT}
-
-                @inbounds begin
-                    ψₜ = $stencil(i, j, k, scheme, ψ, grid, args...)
-                    w = $weno_weights(ψₜ, scheme, Val($val), VI, args...)
-                    return stencil_sum(scheme, ψₜ, w, $biased_p, $cT, $val, idx, loc)
-                end
-            end
-
-            @inline function $interpolate_func(i, j, k, grid, 
-                                               scheme::WENO{N, FT, XT, YT, ZT}, 
-                                               ψ, idx, loc, VI::VelocityStencil, u, v, args...) where {N, FT, XT, YT, ZT}
-
-                @inbounds begin
-                    ψₜ = $stencil(i, j, k, scheme, ψ, grid, u, v, args...)
-                    w = $weno_weights((i, j, k), scheme, Val($val), VI, u, v)
-                    return stencil_sum(scheme, ψₜ, w, $biased_p, $cT, $val, idx, loc)
-                end
-            end
-
-            @inline function $interpolate_func(i, j, k, grid, 
-                                               scheme::WENO{N, FT, XT, YT, ZT}, 
-                                               ψ, idx, loc, VI::FunctionStencil, args...) where {N, FT, XT, YT, ZT}
-
-                @inbounds begin
-                    ψₜ = $stencil(i, j, k, scheme, ψ,       grid, args...)
-                    ψₛ = $stencil(i, j, k, scheme, VI.func, grid, args...)
-                    w = $weno_weights(ψₛ, scheme, Val($val), VI, args...)
-                    return stencil_sum(scheme, ψₜ, w, $biased_p, $cT, $val, idx, loc)
-                end
-            end
+    interpolate_func = Symbol(:inner_biased_interpolate_, interp)
+    stencil          = Symbol(:weno_stencil_, dir)
+    
+    @eval begin
+        @inline function $interpolate_func(i, j, k, grid, 
+                                            scheme::WENO{N, FT, XT, YT, ZT}, bias,
+                                            ψ, idx, loc, args...) where {N, FT, XT, YT, ZT}
+
+            ψₜ = $stencil(i, j, k, scheme, bias, ψ, grid, args...)
+            ω = biased_weno_weights(ψₜ, scheme, bias, Val($val), Nothing, args...)
+            return weno_reconstruction(scheme, bias, ψₜ, ω, $cT, $val, idx, loc)
+        end
+
+        @inline function $interpolate_func(i, j, k, grid, 
+                                            scheme::WENO{N, FT, XT, YT, ZT}, bias, 
+                                            ψ, idx, loc, VI::AbstractSmoothnessStencil, args...) where {N, FT, XT, YT, ZT}
+
+            ψₜ = $stencil(i, j, k, scheme, bias, ψ, grid, args...)
+            ω = biased_weno_weights(ψₜ, scheme, bias, Val($val), VI, args...)
+            return weno_reconstruction(scheme, bias, ψₜ, ω, $cT, $val, idx, loc)
+        end
+
+        @inline function $interpolate_func(i, j, k, grid, 
+                                            scheme::WENO{N, FT, XT, YT, ZT}, bias, 
+                                            ψ, idx, loc, VI::VelocityStencil, u, v, args...) where {N, FT, XT, YT, ZT}
+
+            ψₜ = $stencil(i, j, k, scheme, bias, ψ, grid, u, v, args...)
+            ω = biased_weno_weights((i, j, k), scheme, bias, Val($val), VI, u, v)
+            return weno_reconstruction(scheme, bias, ψₜ, ω, $cT, $val, idx, loc)
+        end
+
+        @inline function $interpolate_func(i, j, k, grid, 
+                                            scheme::WENO{N, FT, XT, YT, ZT}, bias, 
+                                            ψ, idx, loc, VI::FunctionStencil, args...) where {N, FT, XT, YT, ZT}
+
+            ψₜ = $stencil(i, j, k, scheme, bias, ψ,       grid, args...)
+            ψₛ = $stencil(i, j, k, scheme, bias, VI.func, grid, args...)
+            ω = biased_weno_weights(ψₛ, scheme, bias, Val($val), VI, args...)
+            return weno_reconstruction(scheme, bias, ψₜ, ω, $cT, $val, idx, loc)
         end
     end
 end
diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl
index 49e00f9a0b..5ed5a16adc 100644
--- a/src/Advection/weno_reconstruction.jl
+++ b/src/Advection/weno_reconstruction.jl
@@ -2,7 +2,7 @@
 ##### Weighted Essentially Non-Oscillatory (WENO) advection scheme
 #####
 
-struct WENO{N, FT, XT, YT, ZT, WF, PP, CA, SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT}
+struct WENO{N, FT, XT, YT, ZT, PP, CA, SI} <: AbstractUpwindBiasedAdvectionScheme{N, FT}
     
     "Coefficient for ENO reconstruction on x-faces" 
     coeff_xᶠᵃᵃ::XT
@@ -25,16 +25,16 @@ struct WENO{N, FT, XT, YT, ZT, WF, PP, CA, SI} <: AbstractUpwindBiasedAdvectionS
     "Reconstruction scheme used for symmetric interpolation"
     advecting_velocity_scheme :: SI
 
-    function WENO{N, FT, WF}(coeff_xᶠᵃᵃ::XT, coeff_xᶜᵃᵃ::XT,
-                             coeff_yᵃᶠᵃ::YT, coeff_yᵃᶜᵃ::YT, 
-                             coeff_zᵃᵃᶠ::ZT, coeff_zᵃᵃᶜ::ZT,
-                             bounds::PP, buffer_scheme::CA,
-                             advecting_velocity_scheme :: SI) where {N, FT, XT, YT, ZT, WF, PP, CA, SI}
+    function WENO{N, FT}(coeff_xᶠᵃᵃ::XT, coeff_xᶜᵃᵃ::XT,
+                         coeff_yᵃᶠᵃ::YT, coeff_yᵃᶜᵃ::YT, 
+                         coeff_zᵃᵃᶠ::ZT, coeff_zᵃᵃᶜ::ZT,
+                         bounds::PP, buffer_scheme::CA,
+                         advecting_velocity_scheme :: SI) where {N, FT, XT, YT, ZT, PP, CA, SI}
 
-            return new{N, FT, XT, YT, ZT, WF, PP, CA, SI}(coeff_xᶠᵃᵃ, coeff_xᶜᵃᵃ, 
-                                                          coeff_yᵃᶠᵃ, coeff_yᵃᶜᵃ, 
-                                                          coeff_zᵃᵃᶠ, coeff_zᵃᵃᶜ,
-                                                          bounds, buffer_scheme, advecting_velocity_scheme)
+            return new{N, FT, XT, YT, ZT, PP, CA, SI}(coeff_xᶠᵃᵃ, coeff_xᶜᵃᵃ, 
+                                                      coeff_yᵃᶠᵃ, coeff_yᵃᶜᵃ, 
+                                                      coeff_zᵃᵃᶠ, coeff_zᵃᵃᶜ,
+                                                      bounds, buffer_scheme, advecting_velocity_scheme)
     end
 end
 
@@ -42,7 +42,6 @@ end
     WENO([FT=Float64;] 
          order = 5,
          grid = nothing, 
-         zweno = true, 
          bounds = nothing)
                
 Construct a weighted essentially non-oscillatory advection scheme of order `order`.
@@ -52,8 +51,6 @@ Keyword arguments
 
 - `order`: The order of the WENO advection scheme. Default: 5
 - `grid`: (defaults to `nothing`)
-- `zweno`: When `true` implement a Z-WENO formulation for the WENO weights calculation.
-           (defaults to `true`)
 
 Examples
 ========
@@ -62,8 +59,6 @@ julia> using Oceananigans
 
 julia> WENO()
 WENO reconstruction order 5
- Smoothness formulation: 
-    └── Z-weno  
  Boundary scheme: 
     └── WENO reconstruction order 3
  Symmetric scheme: 
@@ -88,8 +83,6 @@ julia> grid = RectilinearGrid(size = (Nx, Nz), halo = (4, 4), topology=(Periodic
 
 julia> WENO(grid; order=7)
 WENO reconstruction order 7
- Smoothness formulation: 
-    └── Z-weno  
  Boundary scheme: 
     └── WENO reconstruction order 5
  Symmetric scheme: 
@@ -103,7 +96,6 @@ WENO reconstruction order 7
 function WENO(FT::DataType=Float64; 
               order = 5,
               grid = nothing, 
-              zweno = true, 
               bounds = nothing)
     
     if !(grid isa Nothing) 
@@ -119,11 +111,11 @@ function WENO(FT::DataType=Float64;
         N  = Int((order + 1) ÷ 2)
 
         weno_coefficients = compute_reconstruction_coefficients(grid, FT, :WENO; order = N)
-        buffer_scheme     = WENO(FT; grid, order = order - 2, zweno, bounds)
+        buffer_scheme     = WENO(FT; grid, order = order - 2, bounds)
         advecting_velocity_scheme = Centered(FT; grid, order = order - 1)
     end
 
-    return WENO{N, FT, zweno}(weno_coefficients..., bounds, buffer_scheme, advecting_velocity_scheme)
+    return WENO{N, FT}(weno_coefficients..., bounds, buffer_scheme, advecting_velocity_scheme)
 end
 
 WENO(grid, FT::DataType=Float64; kwargs...) = WENO(FT; grid, kwargs...)
@@ -133,15 +125,12 @@ WENOThirdOrder(grid=nothing, FT::DataType=Float64;  kwargs...) = WENO(grid, FT;
 WENOFifthOrder(grid=nothing, FT::DataType=Float64;  kwargs...) = WENO(grid, FT; order=5, kwargs...)
 
 # Flavours of WENO
-const ZWENO        = WENO{<:Any, <:Any, <:Any, <:Any, <:Any, true}
-const PositiveWENO = WENO{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Tuple}
+const PositiveWENO = WENO{<:Any, <:Any, <:Any, <:Any, <:Any, <:Tuple}
 
 Base.summary(a::WENO{N}) where N = string("WENO reconstruction order ", N*2-1)
 
-Base.show(io::IO, a::WENO{N, FT, RX, RY, RZ, WF, PP}) where {N, FT, RX, RY, RZ, WF, PP} =
+Base.show(io::IO, a::WENO{N, FT, RX, RY, RZ, PP}) where {N, FT, RX, RY, RZ, PP} =
     print(io, summary(a), " \n",
-              " Smoothness formulation: ", "\n",
-              "    └── $(WF ? "Z-weno" : "JS-weno") \n",
               a.bounds isa Nothing ? "" : " Bounds : \n    └── $(a.bounds) \n",
               " Boundary scheme: ", "\n",
               "    └── ", summary(a.buffer_scheme) , "\n",
@@ -152,21 +141,21 @@ Base.show(io::IO, a::WENO{N, FT, RX, RY, RZ, WF, PP}) where {N, FT, RX, RY, RZ,
               "    ├── Y $(RY == Nothing ? "regular" : "stretched") \n",
               "    └── Z $(RZ == Nothing ? "regular" : "stretched")" )
 
-Adapt.adapt_structure(to, scheme::WENO{N, FT, XT, YT, ZT, WF, PP}) where {N, FT, XT, YT, ZT, WF, PP} =
-     WENO{N, FT, WF}(Adapt.adapt(to, scheme.coeff_xᶠᵃᵃ), Adapt.adapt(to, scheme.coeff_xᶜᵃᵃ),
-                     Adapt.adapt(to, scheme.coeff_yᵃᶠᵃ), Adapt.adapt(to, scheme.coeff_yᵃᶜᵃ),
-                     Adapt.adapt(to, scheme.coeff_zᵃᵃᶠ), Adapt.adapt(to, scheme.coeff_zᵃᵃᶜ),
-                     Adapt.adapt(to, scheme.bounds),
-                     Adapt.adapt(to, scheme.buffer_scheme),
-                     Adapt.adapt(to, scheme.advecting_velocity_scheme))
-
-on_architecture(to, scheme::WENO{N, FT, XT, YT, ZT, WF, PP}) where {N, FT, XT, YT, ZT, WF, PP} =
-    WENO{N, FT, WF}(on_architecture(to, scheme.coeff_xᶠᵃᵃ), on_architecture(to, scheme.coeff_xᶜᵃᵃ),
-                    on_architecture(to, scheme.coeff_yᵃᶠᵃ), on_architecture(to, scheme.coeff_yᵃᶜᵃ),
-                    on_architecture(to, scheme.coeff_zᵃᵃᶠ), on_architecture(to, scheme.coeff_zᵃᵃᶜ),
-                    on_architecture(to, scheme.bounds),
-                    on_architecture(to, scheme.buffer_scheme),
-                    on_architecture(to, scheme.advecting_velocity_scheme))
+Adapt.adapt_structure(to, scheme::WENO{N, FT, XT, YT, ZT, PP}) where {N, FT, XT, YT, ZT, PP} =
+     WENO{N, FT}(Adapt.adapt(to, scheme.coeff_xᶠᵃᵃ), Adapt.adapt(to, scheme.coeff_xᶜᵃᵃ),
+                 Adapt.adapt(to, scheme.coeff_yᵃᶠᵃ), Adapt.adapt(to, scheme.coeff_yᵃᶜᵃ),
+                 Adapt.adapt(to, scheme.coeff_zᵃᵃᶠ), Adapt.adapt(to, scheme.coeff_zᵃᵃᶜ),
+                 Adapt.adapt(to, scheme.bounds),
+                 Adapt.adapt(to, scheme.buffer_scheme),
+                 Adapt.adapt(to, scheme.advecting_velocity_scheme))
+
+on_architecture(to, scheme::WENO{N, FT, XT, YT, ZT, PP}) where {N, FT, XT, YT, ZT, PP} =
+    WENO{N, FT}(on_architecture(to, scheme.coeff_xᶠᵃᵃ), on_architecture(to, scheme.coeff_xᶜᵃᵃ),
+                on_architecture(to, scheme.coeff_yᵃᶠᵃ), on_architecture(to, scheme.coeff_yᵃᶜᵃ),
+                on_architecture(to, scheme.coeff_zᵃᵃᶠ), on_architecture(to, scheme.coeff_zᵃᵃᶜ),
+                on_architecture(to, scheme.bounds),
+                on_architecture(to, scheme.buffer_scheme),
+                on_architecture(to, scheme.advecting_velocity_scheme))
 
 # Retrieve precomputed coefficients (+2 for julia's 1 based indices)
 @inline retrieve_coeff(scheme::WENO, r, ::Val{1}, i, ::Type{Face})   = @inbounds scheme.coeff_xᶠᵃᵃ[r+2][i] 
diff --git a/src/Architectures.jl b/src/Architectures.jl
index 9b03fbf7fa..49743244f4 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -59,7 +59,7 @@ architecture(a::OffsetArray) = architecture(parent(a))
 Return `arch`itecture of child processes.
 On single-process, non-distributed systems, return `arch`.
 """
-child_architecture(arch) = arch
+child_architecture(arch::AbstractSerialArchitecture) = arch
 
 array_type(::CPU) = Array
 array_type(::GPU) = CuArray
@@ -95,15 +95,8 @@ cpu_architecture(::GPU) = CPU()
 unified_array(::CPU, a) = a
 unified_array(::GPU, a) = a
 
-function unified_array(::GPU, arr::AbstractArray) 
-    buf = Mem.alloc(Mem.Unified, sizeof(arr))
-    vec = unsafe_wrap(CuArray{eltype(arr),length(size(arr))}, convert(CuPtr{eltype(arr)}, buf), size(arr))
-    finalizer(vec) do _
-        Mem.free(buf)
-    end
-    copyto!(vec, arr)
-    return vec
-end
+# `cu` can change the element type of `a`, so we convert the result back to `eltype(a)`
+unified_array(::GPU, a::AbstractArray) = map(eltype(a), cu(a; unified = true))
 
 ## GPU to GPU copy of contiguous data
 @inline function device_copy_to!(dst::CuArray, src::CuArray; async::Bool = false) 
diff --git a/src/Biogeochemistry.jl b/src/Biogeochemistry.jl
index fe153f0701..21f7e27471 100644
--- a/src/Biogeochemistry.jl
+++ b/src/Biogeochemistry.jl
@@ -120,7 +120,7 @@ abstract type AbstractContinuousFormBiogeochemistry <: AbstractBiogeochemistry e
     return bgc(val_tracer_name, x, y, z, clock.time, fields_ijk...)
 end
 
-@inline (bgc::AbstractContinuousFormBiogeochemistry)(val_tracer_name, x, y, z, t, fields...) = zero(x)
+@inline (bgc::AbstractContinuousFormBiogeochemistry)(val_tracer_name, x, y, z, t, fields...) = zero(t)
 
 tracernames(tracers) = keys(tracers)
 tracernames(tracers::Tuple) = tracers
diff --git a/src/BoundaryConditions/BoundaryConditions.jl b/src/BoundaryConditions/BoundaryConditions.jl
index 13dd9a6e19..2011bd3df4 100644
--- a/src/BoundaryConditions/BoundaryConditions.jl
+++ b/src/BoundaryConditions/BoundaryConditions.jl
@@ -10,7 +10,7 @@ export
     apply_x_bcs!, apply_y_bcs!, apply_z_bcs!,
     fill_halo_regions!
 
-using CUDA
+using CUDA, Adapt
 using KernelAbstractions: @index, @kernel
 
 using Oceananigans.Architectures: CPU, GPU, device
@@ -18,6 +18,8 @@ using Oceananigans.Utils: work_layout, launch!
 using Oceananigans.Operators: Ax, Ay, Az, volume
 using Oceananigans.Grids
 
+import Adapt: adapt_structure
+
 include("boundary_condition_classifications.jl")
 include("boundary_condition.jl")
 include("discrete_boundary_function.jl")
@@ -34,4 +36,7 @@ include("fill_halo_regions_nothing.jl")
 
 include("apply_flux_bcs.jl")
 
+include("update_boundary_conditions.jl")
+
+include("flat_extrapolation_open_boundary_matching_scheme.jl")
 end # module
diff --git a/src/BoundaryConditions/boundary_condition.jl b/src/BoundaryConditions/boundary_condition.jl
index 86a81cbf25..384d0fb973 100644
--- a/src/BoundaryConditions/boundary_condition.jl
+++ b/src/BoundaryConditions/boundary_condition.jl
@@ -1,4 +1,3 @@
-import Adapt
 import Oceananigans.Architectures: on_architecture
 
 """
@@ -12,21 +11,12 @@ struct BoundaryCondition{C<:AbstractBoundaryConditionClassification, T}
 end
 
 """
-    BoundaryCondition(Classification::DataType, condition)
-
-Construct a boundary condition of type `BC` with a number or array as a `condition`.
-
-Boundary condition types include `Periodic`, `Flux`, `Value`, `Gradient`, and `Open`.
-"""
-BoundaryCondition(Classification::DataType, condition) = BoundaryCondition(Classification(), condition)
-
-"""
-    BoundaryCondition(Classification::DataType, condition::Function;
+    BoundaryCondition(classification::AbstractBoundaryConditionClassification, condition::Function;
                       parameters = nothing,
                       discrete_form = false,
                       field_dependencies=())
 
-Construct a boundary condition of type `Classification` with a function boundary `condition`.
+Construct a boundary condition of type `classification` with a function boundary `condition`.
 
 By default, the function boudnary `condition` is assumed to have the 'continuous form'
 `condition(ξ, η, t)`, where `t` is time and `ξ` and `η` vary along the boundary.
@@ -50,7 +40,7 @@ where `i`, and `j` are indices that vary along the boundary. If `discrete_form =
 condition(i, j, grid, clock, model_fields, parameters)
 ```
 """
-function BoundaryCondition(Classification::DataType, condition::Function;
+function BoundaryCondition(classification::AbstractBoundaryConditionClassification, condition::Function;
                            parameters = nothing,
                            discrete_form = false,
                            field_dependencies=())
@@ -63,17 +53,20 @@ function BoundaryCondition(Classification::DataType, condition::Function;
         condition = ContinuousBoundaryFunction(condition, parameters, field_dependencies)
     end
 
-    return BoundaryCondition(Classification(), condition)
+    return BoundaryCondition(classification, condition)
 end
 
-# Adapt boundary condition struct to be GPU friendly and passable to GPU kernels.
-Adapt.adapt_structure(to, b::BoundaryCondition{Classification}) where Classification =
-    BoundaryCondition(Classification(), Adapt.adapt(to, b.condition))
+# Convenience constructors for buondary condition passing classification types
+BoundaryCondition(Classification::DataType, args...; kwargs...) = BoundaryCondition(Classification(), args...; kwargs...)
+BoundaryCondition(::Type{Open}, args...; kwargs...)             = BoundaryCondition(Open(nothing),    args...; kwargs...)
 
+# Adapt boundary condition struct to be GPU friendly and passable to GPU kernels.
+Adapt.adapt_structure(to, b::BoundaryCondition) =
+    BoundaryCondition(Adapt.adapt(to, b.classification), Adapt.adapt(to, b.condition))
 
 # Adapt boundary condition struct to be GPU friendly and passable to GPU kernels.
-on_architecture(to, b::BoundaryCondition{Classification}) where Classification =
-    BoundaryCondition(Classification(), on_architecture(to, b.condition))
+on_architecture(to, b::BoundaryCondition) =
+    BoundaryCondition(on_architecture(to, b.classification), on_architecture(to, b.condition))
 
 #####
 ##### Some abbreviations to make life easier.
@@ -91,18 +84,18 @@ const MCBC = BoundaryCondition{<:MultiRegionCommunication}
 const DCBC = BoundaryCondition{<:DistributedCommunication}
 
 # More readable BC constructors for the public API.
-                PeriodicBoundaryCondition() = BoundaryCondition(Periodic,                 nothing)
-                  NoFluxBoundaryCondition() = BoundaryCondition(Flux,                     nothing)
-            ImpenetrableBoundaryCondition() = BoundaryCondition(Open,                     nothing)
-MultiRegionCommunicationBoundaryCondition() = BoundaryCondition(MultiRegionCommunication, nothing)
-DistributedCommunicationBoundaryCondition() = BoundaryCondition(DistributedCommunication, nothing)
-
-                    FluxBoundaryCondition(val; kwargs...) = BoundaryCondition(Flux, val; kwargs...)
-                   ValueBoundaryCondition(val; kwargs...) = BoundaryCondition(Value, val; kwargs...)
-                GradientBoundaryCondition(val; kwargs...) = BoundaryCondition(Gradient, val; kwargs...)
-                    OpenBoundaryCondition(val; kwargs...) = BoundaryCondition(Open, val; kwargs...)
-MultiRegionCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(MultiRegionCommunication, val; kwargs...)
-DistributedCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(DistributedCommunication, val; kwargs...)
+                PeriodicBoundaryCondition() = BoundaryCondition(Periodic(),                 nothing)
+                  NoFluxBoundaryCondition() = BoundaryCondition(Flux(),                     nothing)
+            ImpenetrableBoundaryCondition() = BoundaryCondition(Open(), nothing)
+MultiRegionCommunicationBoundaryCondition() = BoundaryCondition(MultiRegionCommunication(), nothing)
+DistributedCommunicationBoundaryCondition() = BoundaryCondition(DistributedCommunication(), nothing)
+
+                    FluxBoundaryCondition(val; kwargs...) = BoundaryCondition(Flux(), val; kwargs...)
+                   ValueBoundaryCondition(val; kwargs...) = BoundaryCondition(Value(), val; kwargs...)
+                GradientBoundaryCondition(val; kwargs...) = BoundaryCondition(Gradient(), val; kwargs...)
+                    OpenBoundaryCondition(val; kwargs...) = BoundaryCondition(Open(nothing), val; kwargs...)
+MultiRegionCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(MultiRegionCommunication(), val; kwargs...)
+DistributedCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(DistributedCommunication(), val; kwargs...)
 
 # Support for various types of boundary conditions.
 #
@@ -125,9 +118,6 @@ DistributedCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(Di
 const NumberRef = Base.RefValue{<:Number}
 @inline getbc(bc::BC{<:Any, <:NumberRef}, args...) = bc.condition[]
 
-Adapt.adapt_structure(to, bc::BoundaryCondition) = BoundaryCondition(Adapt.adapt(to, bc.classification),
-                                                                     Adapt.adapt(to, bc.condition))
-
 #####
 ##### Validation with topology
 #####
diff --git a/src/BoundaryConditions/boundary_condition_classifications.jl b/src/BoundaryConditions/boundary_condition_classifications.jl
index d3d97bb685..234e4f25eb 100644
--- a/src/BoundaryConditions/boundary_condition_classifications.jl
+++ b/src/BoundaryConditions/boundary_condition_classifications.jl
@@ -59,7 +59,15 @@ For fields located at `Faces`, `Open` also specifies field value _on_ the bounda
 Open boundary conditions are used to specify the component of a velocity field normal to a boundary
 and can also be used to describe nested or linked simulation domains.
 """
-struct Open <: AbstractBoundaryConditionClassification end
+struct Open{MS} <: AbstractBoundaryConditionClassification
+    matching_scheme::MS # scheme stored with the classification; `nothing` when built via `Open()`
+end
+
+Open() = Open(nothing) # default constructor: no matching scheme
+
+(open::Open)() = open # calling an `Open` instance returns the instance itself
+
+Adapt.adapt_structure(to, open::Open) = Open(adapt(to, open.matching_scheme)) # adapt the wrapped scheme and rebuild
 
 """
     struct MultiRegionCommunication <: AbstractBoundaryConditionClassification
diff --git a/src/BoundaryConditions/continuous_boundary_function.jl b/src/BoundaryConditions/continuous_boundary_function.jl
index 3023dbd2ac..922739641b 100644
--- a/src/BoundaryConditions/continuous_boundary_function.jl
+++ b/src/BoundaryConditions/continuous_boundary_function.jl
@@ -88,7 +88,7 @@ function regularize_boundary_condition(bc::BoundaryCondition{C, <:ContinuousBoun
                                                                              boundary_func.field_dependencies,
                                                                              indices, interps)
 
-    return BoundaryCondition(C, regularized_boundary_func)
+    return BoundaryCondition(bc.classification, regularized_boundary_func)
 end
 
 @inline domain_boundary_indices(::LeftBoundary, N) = 1, 1
diff --git a/src/BoundaryConditions/field_boundary_conditions.jl b/src/BoundaryConditions/field_boundary_conditions.jl
index a12f1d0f63..16fff0a977 100644
--- a/src/BoundaryConditions/field_boundary_conditions.jl
+++ b/src/BoundaryConditions/field_boundary_conditions.jl
@@ -57,7 +57,10 @@ function FieldBoundaryConditions(indices::Tuple, west, east, south, north, botto
 end
 
 FieldBoundaryConditions(indices::Tuple, bcs::FieldBoundaryConditions) =
-    FieldBoundaryConditions(indices, (getproperty(bcs, side) for side in fieldnames(FieldBoundaryConditions))...)
+    FieldBoundaryConditions(indices, (getproperty(bcs, side) for side in propertynames(bcs))...)
+
+
+FieldBoundaryConditions(indices::Tuple, ::Nothing) = nothing
 
 window_boundary_conditions(::Colon,     left, right) = left, right
 window_boundary_conditions(::UnitRange, left, right) = nothing, nothing
@@ -132,15 +135,17 @@ and the topology in the boundary-normal direction is used:
 - `nothing` for `Bounded` directions and `Face`-located fields
 - `nothing` for `Flat` directions and/or `Nothing`-located fields
 """
-FieldBoundaryConditions(grid, location, indices=(:, :, :);
-                        west     = default_auxiliary_bc(topology(grid, 1)(), location[1]()),
-                        east     = default_auxiliary_bc(topology(grid, 1)(), location[1]()),
-                        south    = default_auxiliary_bc(topology(grid, 2)(), location[2]()),
-                        north    = default_auxiliary_bc(topology(grid, 2)(), location[2]()),
-                        bottom   = default_auxiliary_bc(topology(grid, 3)(), location[3]()),
-                        top      = default_auxiliary_bc(topology(grid, 3)(), location[3]()),
-                        immersed = NoFluxBoundaryCondition()) =
-    FieldBoundaryConditions(indices, west, east, south, north, bottom, top, immersed)
+function FieldBoundaryConditions(grid::AbstractGrid, location, indices=(:, :, :);
+                                 west     = default_auxiliary_bc(topology(grid, 1)(), location[1]()),
+                                 east     = default_auxiliary_bc(topology(grid, 1)(), location[1]()),
+                                 south    = default_auxiliary_bc(topology(grid, 2)(), location[2]()),
+                                 north    = default_auxiliary_bc(topology(grid, 2)(), location[2]()),
+                                 bottom   = default_auxiliary_bc(topology(grid, 3)(), location[3]()),
+                                 top      = default_auxiliary_bc(topology(grid, 3)(), location[3]()),
+                                 immersed = NoFluxBoundaryCondition())
+
+    return FieldBoundaryConditions(indices, west, east, south, north, bottom, top, immersed)
+end
 
 #####
 ##### Boundary condition "regularization"
@@ -151,11 +156,12 @@ FieldBoundaryConditions(grid, location, indices=(:, :, :);
 # Friendly warning?
 function regularize_immersed_boundary_condition(ibc, grid, loc, field_name, args...)
     if !(ibc isa DefaultBoundaryCondition)
-        msg = """
-              $field_name was assigned an immersed $ibc, but this is not supported on
-              $(summary(grid))
+        msg = """$field_name was assigned an immersed boundary condition
+              $ibc ,
+              but this is not supported on
+              $(summary(grid)) .
               The immersed boundary condition on $field_name will have no effect.
-          """
+              """
 
         @warn msg
     end
diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index cc92479f40..44d72c2bd3 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -45,10 +45,14 @@ end
 const MaybeTupledData = Union{OffsetArray, NTuple{<:Any, OffsetArray}}
 
 "Fill halo regions in ``x``, ``y``, and ``z`` for a given field's data."
-function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, loc, grid, args...; kwargs...)
-
+function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, loc, grid, args...; 
+                            fill_boundary_normal_velocities = true, kwargs...)
     arch = architecture(grid)
 
+    if fill_boundary_normal_velocities
+        fill_open_boundary_regions!(c, boundary_conditions, indices, loc, grid, args...; kwargs...)
+    end
+
     fill_halos!, bcs = permute_boundary_conditions(boundary_conditions)
     number_of_tasks  = length(fill_halos!)
 
@@ -208,7 +212,6 @@ end
     _fill_bottom_halo!(i, j, grid, c, bottom_bc, loc, args...)
        _fill_top_halo!(i, j, grid, c, top_bc,    loc, args...)
 end
-
 #####
 ##### Single-sided fill_halo! kernels
 #####
diff --git a/src/BoundaryConditions/fill_halo_regions_open.jl b/src/BoundaryConditions/fill_halo_regions_open.jl
index 75ff5eac05..152ebd29b7 100644
--- a/src/BoundaryConditions/fill_halo_regions_open.jl
+++ b/src/BoundaryConditions/fill_halo_regions_open.jl
@@ -1,39 +1,109 @@
-#####
-##### Outer functions for filling halo regions for open boundary conditions.
-#####
-
-# TODO: support true open boundary conditions.
-# For this we need to have separate functions for each of the six boundaries,
-# and need to unroll a loop over the boundary normal direction.
-# The syntax for `getbc` is also different for OpenBoundaryCondition than for others,
-# because the boundary-normal index can vary (and array boundary conditions need to be
-# 3D in general).
-
-@kernel function set_west_or_east_u!(u, i_boundary, bc, grid, args) 
+@inline fill_open_boundary_regions!(field, args...) = # convenience method: unpack `field`, forward `args`
+    fill_open_boundary_regions!(field, field.boundary_conditions, field.indices, instantiated_location(field), field.grid, args...)
+
+"""
+    fill_open_boundary_regions!(field, boundary_conditions, indices, loc, grid, args...; kwargs...)
+
+Fill the open boundaries of `field` by launching the `open_fill` kernel selected by `loc` with the left and right boundary conditions.
+"""
+function fill_open_boundary_regions!(field, boundary_conditions, indices, loc, grid, args...; kwargs...)
+    arch = architecture(grid)
+
+    left_bc = left_velocity_open_boundary_condition(boundary_conditions, loc)
+    right_bc = right_velocity_open_boundary_condition(boundary_conditions, loc)
+
+    # `open_fill` is the kernel that fills open boundaries at `loc`; `regular_fill` is the matching
+    # non-open fill function, used here only so that `fill_halo_size` can determine the kernel size.
+    open_fill, regular_fill = get_open_halo_filling_functions(loc) 
+    fill_size = fill_halo_size(field, regular_fill, indices, boundary_conditions, loc, grid)
+
+    launch!(arch, grid, fill_size, open_fill, field, left_bc, right_bc, loc, grid, args)
+
+    return nothing
+end
+
+fill_open_boundary_regions!(fields::NTuple, boundary_conditions, indices, loc, grid, args...; kwargs...) =
+    [fill_open_boundary_regions!(field, boundary_conditions[n], indices, loc[n], grid, args...; kwargs...) for (n, field) in enumerate(fields)]
+
+# for regular halo fills
+@inline left_velocity_open_boundary_condition(boundary_condition, loc) = nothing
+@inline left_velocity_open_boundary_condition(boundary_conditions, ::Tuple{Face, Center, Center}) = boundary_conditions.west
+@inline left_velocity_open_boundary_condition(boundary_conditions, ::Tuple{Center, Face, Center}) = boundary_conditions.south
+@inline left_velocity_open_boundary_condition(boundary_conditions, ::Tuple{Center, Center, Face}) = boundary_conditions.bottom
+
+@inline right_velocity_open_boundary_condition(boundary_conditions, loc) = nothing
+@inline right_velocity_open_boundary_condition(boundary_conditions, ::Tuple{Face, Center, Center}) = boundary_conditions.east
+@inline right_velocity_open_boundary_condition(boundary_conditions, ::Tuple{Center, Face, Center}) = boundary_conditions.north
+@inline right_velocity_open_boundary_condition(boundary_conditions, ::Tuple{Center, Center, Face}) = boundary_conditions.top
+
+# for multi region halo fills
+@inline left_velocity_open_boundary_condition(boundary_conditions::Tuple, ::Tuple{Face, Center, Center}) = @inbounds boundary_conditions[1]
+@inline left_velocity_open_boundary_condition(boundary_conditions::Tuple, ::Tuple{Center, Face, Center}) = @inbounds boundary_conditions[1]
+@inline left_velocity_open_boundary_condition(boundary_conditions::Tuple, ::Tuple{Center, Center, Face}) = @inbounds boundary_conditions[1]
+
+@inline right_velocity_open_boundary_condition(boundary_conditions::Tuple, ::Tuple{Face, Center, Center}) = @inbounds boundary_conditions[2]
+@inline right_velocity_open_boundary_condition(boundary_conditions::Tuple, ::Tuple{Center, Face, Center}) = @inbounds boundary_conditions[2]
+@inline right_velocity_open_boundary_condition(boundary_conditions::Tuple, ::Tuple{Center, Center, Face}) = @inbounds boundary_conditions[2]
+
+@inline get_open_halo_filling_functions(loc) = _no_fill!, _no_fill!
+@inline get_open_halo_filling_functions(::Tuple{Face, Center, Center}) = _fill_west_and_east_open_halo!, fill_west_and_east_halo!
+@inline get_open_halo_filling_functions(::Tuple{Center, Face, Center}) = _fill_south_and_north_open_halo!, fill_south_and_north_halo!
+@inline get_open_halo_filling_functions(::Tuple{Center, Center, Face}) = _fill_bottom_and_top_open_halo!, fill_bottom_and_top_halo!
+
+@kernel _no_fill!(args...) = nothing
+
+@inline fill_halo_size(field, ::typeof(_no_fill!), args...) = (0, 0)
+
+@kernel function _fill_west_and_east_open_halo!(c, west_bc, east_bc, loc, grid, args) 
     j, k = @index(Global, NTuple)
-    @inbounds u[i_boundary, j, k] = getbc(bc, j, k, grid, args...)
+    _fill_west_open_halo!(j, k, grid, c, west_bc, loc, args...)
+    _fill_east_open_halo!(j, k, grid, c, east_bc, loc, args...)
 end
 
-@kernel function set_south_or_north_v!(v, j_boundary, bc, grid, args)
+@kernel function _fill_south_and_north_open_halo!(c, south_bc, north_bc, loc, grid, args)
     i, k = @index(Global, NTuple)
-    @inbounds v[i, j_boundary, k] = getbc(bc, i, k, grid, args...)
+    _fill_south_open_halo!(i, k, grid, c, south_bc, loc, args...)
+    _fill_north_open_halo!(i, k, grid, c, north_bc, loc, args...)
 end
 
-@kernel function set_bottom_or_top_w!(w, k_boundary, bc, grid, args) 
+@kernel function _fill_bottom_and_top_open_halo!(c, bottom_bc, top_bc, loc, grid, args)
     i, j = @index(Global, NTuple)
-    @inbounds w[i, j, k_boundary] = getbc(bc, i, j, grid, args...)
+    _fill_bottom_open_halo!(i, j, grid, c, bottom_bc, loc, args...)
+       _fill_top_open_halo!(i, j, grid, c, top_bc,    loc, args...)
 end
 
-@inline   fill_west_halo!(u, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_west_or_east_u!,   u,           1, bc, grid, Tuple(args); kwargs...)
-@inline   fill_east_halo!(u, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_west_or_east_u!,   u, grid.Nx + 1, bc, grid, Tuple(args); kwargs...)
-@inline  fill_south_halo!(v, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_south_or_north_v!, v,           1, bc, grid, Tuple(args); kwargs...)
-@inline  fill_north_halo!(v, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_south_or_north_v!, v, grid.Ny + 1, bc, grid, Tuple(args); kwargs...)
-@inline fill_bottom_halo!(w, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_bottom_or_top_w!,  w,           1, bc, grid, Tuple(args); kwargs...)
-@inline    fill_top_halo!(w, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_bottom_or_top_w!,  w, grid.Nz + 1, bc, grid, Tuple(args); kwargs...)
-
-@inline   _fill_west_halo!(j, k, grid, c, bc::OBC, loc, args...) = @inbounds c[1, j, k]           = getbc(bc, j, k, grid, args...)
-@inline   _fill_east_halo!(j, k, grid, c, bc::OBC, loc, args...) = @inbounds c[grid.Nx + 1, j, k] = getbc(bc, j, k, grid, args...)
-@inline  _fill_south_halo!(i, k, grid, c, bc::OBC, loc, args...) = @inbounds c[i, 1, k]           = getbc(bc, i, k, grid, args...)
-@inline  _fill_north_halo!(i, k, grid, c, bc::OBC, loc, args...) = @inbounds c[i, grid.Ny + 1, k] = getbc(bc, i, k, grid, args...)
-@inline _fill_bottom_halo!(i, j, grid, c, bc::OBC, loc, args...) = @inbounds c[i, j, 1]           = getbc(bc, i, j, grid, args...)
-@inline    _fill_top_halo!(i, j, grid, c, bc::OBC, loc, args...) = @inbounds c[i, j, grid.Nz + 1] = getbc(bc, i, j, grid, args...)
+# Generic fallback
+
+@inline   _fill_west_open_halo!(j, k, grid, c, bc, loc, args...) = nothing
+@inline   _fill_east_open_halo!(j, k, grid, c, bc, loc, args...) = nothing
+@inline  _fill_south_open_halo!(i, k, grid, c, bc, loc, args...) = nothing
+@inline  _fill_north_open_halo!(i, k, grid, c, bc, loc, args...) = nothing
+@inline _fill_bottom_open_halo!(i, j, grid, c, bc, loc, args...) = nothing
+@inline    _fill_top_open_halo!(i, j, grid, c, bc, loc, args...) = nothing
+
+# Open boundary condition fallback
+
+@inline   _fill_west_open_halo!(j, k, grid, c, bc::OBC, loc, args...) = @inbounds c[1, j, k]           = getbc(bc, j, k, grid, args...)
+@inline   _fill_east_open_halo!(j, k, grid, c, bc::OBC, loc, args...) = @inbounds c[grid.Nx + 1, j, k] = getbc(bc, j, k, grid, args...)
+@inline  _fill_south_open_halo!(i, k, grid, c, bc::OBC, loc, args...) = @inbounds c[i, 1, k]           = getbc(bc, i, k, grid, args...)
+@inline  _fill_north_open_halo!(i, k, grid, c, bc::OBC, loc, args...) = @inbounds c[i, grid.Ny + 1, k] = getbc(bc, i, k, grid, args...)
+@inline _fill_bottom_open_halo!(i, j, grid, c, bc::OBC, loc, args...) = @inbounds c[i, j, 1]           = getbc(bc, i, j, grid, args...)
+@inline    _fill_top_open_halo!(i, j, grid, c, bc::OBC, loc, args...) = @inbounds c[i, j, grid.Nz + 1] = getbc(bc, i, j, grid, args...)
+
+# Regular boundary fill defaults
+
+@inline   _fill_west_halo!(j, k, grid, c, bc::OBC, loc, args...) = _fill_west_open_halo!(j, k, grid, c, bc, loc, args...)
+@inline   _fill_east_halo!(j, k, grid, c, bc::OBC, loc, args...) = _fill_east_open_halo!(j, k, grid, c, bc, loc, args...)
+@inline  _fill_south_halo!(i, k, grid, c, bc::OBC, loc, args...) = _fill_south_open_halo!(i, k, grid, c, bc, loc, args...)
+@inline  _fill_north_halo!(i, k, grid, c, bc::OBC, loc, args...) = _fill_north_open_halo!(i, k, grid, c, bc, loc, args...)
+@inline _fill_bottom_halo!(i, j, grid, c, bc::OBC, loc, args...) = _fill_bottom_open_halo!(i, j, grid, c, bc, loc, args...)
+@inline    _fill_top_halo!(i, j, grid, c, bc::OBC, loc, args...) = _fill_top_open_halo!(i, j, grid, c, bc, loc, args...)
+
+# Regular boundary fill for wall normal velocities
+
+@inline   _fill_west_halo!(j, k, grid, c, bc::OBC, ::Tuple{Face, <:Any, <:Any}, args...) = nothing
+@inline   _fill_east_halo!(j, k, grid, c, bc::OBC, ::Tuple{Face, <:Any, <:Any}, args...) = nothing
+@inline  _fill_south_halo!(i, k, grid, c, bc::OBC, ::Tuple{<:Any, Face, <:Any}, args...) = nothing
+@inline  _fill_north_halo!(i, k, grid, c, bc::OBC, ::Tuple{<:Any, Face, <:Any}, args...) = nothing
+@inline _fill_bottom_halo!(i, j, grid, c, bc::OBC, ::Tuple{<:Any, <:Any, Face}, args...) = nothing
+@inline    _fill_top_halo!(i, j, grid, c, bc::OBC, ::Tuple{<:Any, <:Any, Face}, args...) = nothing
diff --git a/src/BoundaryConditions/flat_extrapolation_open_boundary_matching_scheme.jl b/src/BoundaryConditions/flat_extrapolation_open_boundary_matching_scheme.jl
new file mode 100644
index 0000000000..a5bc6435c7
--- /dev/null
+++ b/src/BoundaryConditions/flat_extrapolation_open_boundary_matching_scheme.jl
@@ -0,0 +1,152 @@
+using Oceananigans.Grids: xspacing, yspacing, zspacing
+
+"""
+    FlatExtrapolation
+
+Zero gradient perpendicular velocity boundary condition.
+
+We find the boundary value by Taylor expanding the gradient at the boundary point (`xᵢ`)
+to second order:
+```math
+f′(xᵢ) ≈ f′(xᵢ₋₁) + f′′(xᵢ₋₁)(xᵢ₋₁ - xᵢ) + O(Δx²) = f′(xᵢ₋₁) + f′′(xᵢ₋₁)Δx + O(Δx²),
+```
+where ``Δx=xᵢ₋₁ - xᵢ`` (for simplicity, we will also assume the spacing is constant at
+all ``i`` for now).
+We can substitute the gradient at some point ``j`` (``f′(xⱼ)``) with the central 
+difference approximation:
+```math
+f′(xⱼ) ≈ (f(xⱼ₊₁) - f(xⱼ₋₁)) / 2Δx,
+```
+and the second derivative at some point ``j`` (``f′′(xⱼ)``) can be approximated as:
+```math
+f′′(xⱼ) ≈ (f′(xⱼ₊₁) - f′(xⱼ₋₁)) / 2Δx = ((f(xⱼ₊₂) - f(xⱼ)) - (f(xⱼ) - f(xⱼ₋₂))) / (2Δx)².
+```
+When we then substitute for the boundary adjacent point ``f′′(xᵢ₋₁)`` we know that 
+``f′(xⱼ₊₁)=f′(xᵢ)=0`` so the Taylor expansion becomes:
+```math
+f(xᵢ) ≈ f(xᵢ₋₂) - (f(xᵢ₋₁) - f(xᵢ₋₃))/2 + O(Δx²).
+```
+
+When the grid spacing is not constant, the same steps can be repeated, and the factor
+of 1/2 is replaced by ``Δxᵢ₋₁/(Δxᵢ₋₂ + Δxᵢ₋₃)``, i.e.:
+```math
+f(xᵢ) ≈ f(xᵢ₋₂) - (f(xᵢ₋₁) - f(xᵢ₋₃))Δxᵢ₋₁/(Δxᵢ₋₂ + Δxᵢ₋₃) + O(Δx²).
+```
+"""
+struct FlatExtrapolation{FT}
+    relaxation_timescale :: FT
+end
+
+const FEOBC = BoundaryCondition{<:Open{<:FlatExtrapolation}}
+
+function FlatExtrapolationOpenBoundaryCondition(val = nothing; relaxation_timescale = Inf, kwargs...)
+    classification = Open(FlatExtrapolation(relaxation_timescale))
+    
+    return BoundaryCondition(classification, val; kwargs...)
+end
+
+@inline function relax(l, m, grid, ϕ, bc, clock, model_fields)
+    Δt = clock.last_stage_Δt 
+    τ = bc.classification.matching_scheme.relaxation_timescale
+
+    Δt̄ = min(1, Δt / τ)
+    ϕₑₓₜ = getbc(bc, l, m, grid, clock, model_fields)
+
+    Δϕ = (ϕₑₓₜ - ϕ) * Δt̄
+    not_relaxing = isnothing(bc.condition) | !isfinite(clock.last_stage_Δt)
+    Δϕ =  ifelse(not_relaxing, zero(ϕ), Δϕ)
+
+    return ϕ + Δϕ
+end
+
+const c = Center()
+
+@inline function _fill_west_open_halo!(j, k, grid, ϕ, bc::FEOBC, loc, clock, model_fields)
+    Δx₁ = xspacing(1, j, k, grid, c, c, c)
+    Δx₂ = xspacing(2, j, k, grid, c, c, c)
+    Δx₃ = xspacing(3, j, k, grid, c, c, c)
+
+    spacing_factor = Δx₁ / (Δx₂ + Δx₃)
+
+    gradient_free_ϕ = @inbounds ϕ[3, j, k] - (ϕ[2, j, k] - ϕ[4, j, k]) * spacing_factor
+
+    @inbounds ϕ[1, j, k] = relax(j, k, grid, gradient_free_ϕ, bc, clock, model_fields)
+
+    return nothing
+end
+
+@inline function _fill_east_open_halo!(j, k, grid, ϕ, bc::FEOBC, loc, clock, model_fields)
+    i = grid.Nx + 1
+
+    Δx₁ = xspacing(i-1, j, k, grid, c, c, c)
+    Δx₂ = xspacing(i-2, j, k, grid, c, c, c)
+    Δx₃ = xspacing(i-3, j, k, grid, c, c, c)
+
+    spacing_factor = Δx₁ / (Δx₂ + Δx₃)
+
+    gradient_free_ϕ = @inbounds ϕ[i - 2, j, k] - (ϕ[i - 1, j, k] - ϕ[i - 3, j, k]) * spacing_factor
+
+    @inbounds ϕ[i, j, k] = relax(j, k, grid, gradient_free_ϕ, bc, clock, model_fields)
+
+    return nothing
+end
+
+@inline function _fill_south_open_halo!(i, k, grid, ϕ, bc::FEOBC, loc, clock, model_fields)
+    Δy₁ = yspacing(i, 1, k, grid, c, c, c)
+    Δy₂ = yspacing(i, 2, k, grid, c, c, c)
+    Δy₃ = yspacing(i, 3, k, grid, c, c, c)
+
+    spacing_factor = Δy₁ / (Δy₂ + Δy₃)
+    # Extrapolate to the south face (j = 1) from the three nearest interior values
+    gradient_free_ϕ = @inbounds ϕ[i, 3, k] - (ϕ[i, 2, k] - ϕ[i, 4, k]) * spacing_factor
+    # `relax` nudges the extrapolated value towards the boundary condition value, if one is set
+    @inbounds ϕ[i, 1, k] = relax(i, k, grid, gradient_free_ϕ, bc, clock, model_fields)
+
+    return nothing
+end
+
+@inline function _fill_north_open_halo!(i, k, grid, ϕ, bc::FEOBC, loc, clock, model_fields)
+    j = grid.Ny + 1
+
+    Δy₁ = yspacing(i, j-1, k, grid, c, c, c)
+    Δy₂ = yspacing(i, j-2, k, grid, c, c, c)
+    Δy₃ = yspacing(i, j-3, k, grid, c, c, c)
+
+    spacing_factor = Δy₁ / (Δy₂ + Δy₃)
+
+    gradient_free_ϕ = @inbounds ϕ[i, j - 2, k] - (ϕ[i, j - 1, k] - ϕ[i, j - 3, k]) * spacing_factor
+
+    @inbounds ϕ[i, j, k] = relax(i, k, grid, gradient_free_ϕ, bc, clock, model_fields)
+
+    return nothing
+end
+
+@inline function _fill_bottom_open_halo!(i, j, grid, ϕ, bc::FEOBC, loc, clock, model_fields)
+    Δz₁ = zspacing(i, j, 1, grid, c, c, c)
+    Δz₂ = zspacing(i, j, 2, grid, c, c, c)
+    Δz₃ = zspacing(i, j, 3, grid, c, c, c)
+
+    spacing_factor = Δz₁ / (Δz₂ + Δz₃)
+    # Extrapolate to the bottom face (k = 1) from the three nearest interior values
+    gradient_free_ϕ = @inbounds ϕ[i, j, 3] - (ϕ[i, j, 2] - ϕ[i, j, 4]) * spacing_factor
+    # `relax` nudges the extrapolated value towards the boundary condition value, if one is set
+    @inbounds ϕ[i, j, 1] = relax(i, j, grid, gradient_free_ϕ, bc, clock, model_fields)
+
+    return nothing
+end
+
+@inline function _fill_top_open_halo!(i, j, grid, ϕ, bc::FEOBC, loc, clock, model_fields)
+    k = grid.Nz + 1
+
+    Δz₁ = zspacing(i, j, k-1, grid, c, c, c)
+    Δz₂ = zspacing(i, j, k-2, grid, c, c, c)
+    Δz₃ = zspacing(i, j, k-3, grid, c, c, c)
+
+    spacing_factor = Δz₁ / (Δz₂ + Δz₃)
+
+    gradient_free_ϕ = @inbounds ϕ[i, j, k - 2] - (ϕ[i, j, k - 1] - ϕ[i, j, k - 3]) * spacing_factor
+
+    @inbounds ϕ[i, j, k] = relax(i, j, grid, gradient_free_ϕ, bc, clock, model_fields)
+
+    return nothing
+end
\ No newline at end of file
diff --git a/src/BoundaryConditions/show_boundary_conditions.jl b/src/BoundaryConditions/show_boundary_conditions.jl
index e6ab12a6bd..24a568f057 100644
--- a/src/BoundaryConditions/show_boundary_conditions.jl
+++ b/src/BoundaryConditions/show_boundary_conditions.jl
@@ -4,29 +4,30 @@ using Oceananigans.Utils: prettysummary
 const DFBC = DefaultBoundaryCondition
 const IBC = BoundaryCondition{Open, Nothing} # ImpenetrableBoundaryCondition
 
-bc_str(::FBC)     = "Flux"
-bc_str(::PBC)     = "Periodic"
-bc_str(::OBC)     = "Open"
-bc_str(::VBC)     = "Value"
-bc_str(::GBC)     = "Gradient"
-bc_str(::ZFBC)    = "ZeroFlux"
-bc_str(::IBC)     = "Impenetrable"
-bc_str(::DFBC)    = "Default"
-bc_str(::MCBC)    = "MultiRegionCommunication"
-bc_str(::DCBC)    = "DistributedCommunication"
-bc_str(::Nothing) = "Nothing"
+bc_str(::FBC)                    = "Flux"
+bc_str(::PBC)                    = "Periodic"
+bc_str(::OBC{Open{MS}}) where MS = "Open{$MS}"
+bc_str(::VBC)                    = "Value"
+bc_str(::GBC)                    = "Gradient"
+bc_str(::ZFBC)                   = "ZeroFlux"
+bc_str(::IBC)                    = "Impenetrable"
+bc_str(::DFBC)                   = "Default"
+bc_str(::MCBC)                   = "MultiRegionCommunication"
+bc_str(::DCBC)                   = "DistributedCommunication"
+bc_str(::Nothing)                = "Nothing"
 
 #####
 ##### BoundaryCondition
 #####
 
-Base.summary(bc::DFBC) = string("DefaultBoundaryCondition (", summary(bc.boundary_condition), ")")
-Base.summary(bc::OBC)  = string("OpenBoundaryCondition: ", prettysummary(bc.condition))
-Base.summary(bc::FBC)  = string("FluxBoundaryCondition: ", prettysummary(bc.condition))
-Base.summary(bc::VBC)  = string("ValueBoundaryCondition: ", prettysummary(bc.condition))
-Base.summary(bc::GBC)  = string("GradientBoundaryCondition: ", prettysummary(bc.condition))
-Base.summary(::PBC)    = string("PeriodicBoundaryCondition")
-Base.summary(bc::DCBC) = string("DistributedBoundaryCondition: ", prettysummary(bc.condition))
+Base.summary(bc::DFBC)                    = string("DefaultBoundaryCondition (", summary(bc.boundary_condition), ")")
+Base.summary(bc::OBC{Open{MS}}) where MS  = string("OpenBoundaryCondition{$MS}: ", prettysummary(bc.condition))
+Base.summary(bc::IBC)                     = string("ImpenetrableBoundaryCondition")
+Base.summary(bc::FBC)                     = string("FluxBoundaryCondition: ", prettysummary(bc.condition))
+Base.summary(bc::VBC)                     = string("ValueBoundaryCondition: ", prettysummary(bc.condition))
+Base.summary(bc::GBC)                     = string("GradientBoundaryCondition: ", prettysummary(bc.condition))
+Base.summary(::PBC)                       = string("PeriodicBoundaryCondition")
+Base.summary(bc::DCBC)                    = string("DistributedBoundaryCondition: ", prettysummary(bc.condition))
 
 show(io::IO, bc::BoundaryCondition) = print(io, summary(bc))
 
diff --git a/src/BoundaryConditions/update_boundary_conditions.jl b/src/BoundaryConditions/update_boundary_conditions.jl
new file mode 100644
index 0000000000..241de5825a
--- /dev/null
+++ b/src/BoundaryConditions/update_boundary_conditions.jl
@@ -0,0 +1,27 @@
+using Oceananigans: boundary_conditions
+
+@inline update_boundary_condition!(bc, args...) = nothing
+
+function update_boundary_condition!(bcs::FieldBoundaryConditions, field, model)
+    update_boundary_condition!(bcs.west, Val(:west), field, model)
+    update_boundary_condition!(bcs.east, Val(:east), field, model)
+    update_boundary_condition!(bcs.south, Val(:south), field, model)
+    update_boundary_condition!(bcs.north, Val(:north), field, model)
+    update_boundary_condition!(bcs.bottom, Val(:bottom), field, model)
+    update_boundary_condition!(bcs.top, Val(:top), field, model)
+    update_boundary_condition!(bcs.immersed, Val(:immersed), field, model)
+    return nothing
+end
+
+update_boundary_condition!(fields::NamedTuple, model) = update_boundary_condition!(values(fields), model)
+
+function update_boundary_condition!(fields::Tuple, model)
+    N = length(fields)
+    ntuple(Val(N)) do n
+        field = fields[n]
+        bcs = boundary_conditions(field)
+        update_boundary_condition!(bcs, field, model)
+    end
+
+    return nothing
+end
diff --git a/src/BuoyancyModels/buoyancy.jl b/src/BuoyancyModels/buoyancy.jl
index 8acf985821..3754cb4a4d 100644
--- a/src/BuoyancyModels/buoyancy.jl
+++ b/src/BuoyancyModels/buoyancy.jl
@@ -31,11 +31,11 @@ model = NonhydrostaticModel(; grid, buoyancy, tracers=:b)
 
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 1×8×8 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 1×3×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: b
 ├── closure: Nothing
-├── buoyancy: BuoyancyTracer with ĝ = Tuple{Float64, Float64, Float64}
+├── buoyancy: BuoyancyTracer with ĝ = (0.0, -0.707107, -0.707107)
 └── coriolis: Nothing
 ```
 """
@@ -70,6 +70,18 @@ end
 regularize_buoyancy(b) = b
 regularize_buoyancy(b::AbstractBuoyancyModel) = Buoyancy(model=b)
 
-Base.summary(buoyancy::Buoyancy) = string(summary(buoyancy.model), " with ĝ = ", summary(buoyancy.gravity_unit_vector))
+Base.summary(buoyancy::Buoyancy) = string(summary(buoyancy.model),
+                                          " with ĝ = ",
+                                          summarize_vector(buoyancy.gravity_unit_vector))
 
-Base.show(io::IO, buoyancy::Buoyancy) = print(io, sprint(show, buoyancy.model), "\nwith `gravity_unit_vector` = ", summary(buoyancy.gravity_unit_vector))
+summarize_vector(n) = string("(", prettysummary(n[1]), ", ",
+                                  prettysummary(n[2]), ", ",
+                                  prettysummary(n[3]), ")")
+                             
+summarize_vector(::NegativeZDirection) = "NegativeZDirection()"
+
+function Base.show(io::IO, buoyancy::Buoyancy)
+    print(io, "Buoyancy:", '\n',
+              "├── model: ", prettysummary(buoyancy.model), '\n',
+              "└── gravity_unit_vector: ", summarize_vector(buoyancy.gravity_unit_vector))
+end
diff --git a/src/DistributedComputations/DistributedComputations.jl b/src/DistributedComputations/DistributedComputations.jl
index b1e224c607..205997ce24 100644
--- a/src/DistributedComputations/DistributedComputations.jl
+++ b/src/DistributedComputations/DistributedComputations.jl
@@ -2,7 +2,7 @@ module DistributedComputations
 
 export
     Distributed, Partition, Equal, Fractional, 
-    child_architecture, reconstruct_global_grid, 
+    child_architecture, reconstruct_global_grid, partition,
     inject_halo_communication_boundary_conditions,
     DistributedFFTBasedPoissonSolver
 
@@ -19,6 +19,10 @@ include("distributed_kernel_launching.jl")
 include("halo_communication_bcs.jl")
 include("distributed_fields.jl")
 include("halo_communication.jl")
+include("transposable_field.jl")
+include("distributed_transpose.jl")
+include("plan_distributed_transforms.jl")
 include("distributed_fft_based_poisson_solver.jl")
+include("distributed_fft_tridiagonal_solver.jl")
 
 end # module
diff --git a/src/DistributedComputations/distributed_architectures.jl b/src/DistributedComputations/distributed_architectures.jl
index d8bdc533fb..eca877fc8c 100644
--- a/src/DistributedComputations/distributed_architectures.jl
+++ b/src/DistributedComputations/distributed_architectures.jl
@@ -24,7 +24,7 @@ end
 Return `Partition` representing the division of a domain in
 the `x` (first), `y` (second) and `z` (third) dimension
 
-Keyword arguments: 
+Keyword arguments:
 ==================
 
 - `x`: partitioning of the first dimension 
@@ -49,12 +49,12 @@ Examples:
 julia> using Oceananigans; using Oceananigans.DistributedComputations
 
 julia> Partition(1, 4)
-Domain partitioning with (1, 4, 1) ranks
-└── y-partitioning: 4
+Partition across 4 = 1×4×1 ranks:
+└── y: 4
 
 julia> Partition(x = Fractional(1, 2, 3, 4))
-Domain partitioning with (4, 1, 1) ranks
-└── x-partitioning: domain fractions: (0.1, 0.2, 0.3, 0.4)
+Partition across 4 = 4×1×1 ranks:
+└── x: Fractional(0.1, 0.2, 0.3, 0.4)
 
 ```
 """
@@ -63,16 +63,40 @@ Partition(x, y) = Partition(validate_partition(x, y, nothing)...)
 
 Partition(; x = nothing, y = nothing, z = nothing) = Partition(validate_partition(x, y, z)...)
 
-Base.show(io::IO, p::Partition) =
-    print(io, 
-    "Domain partitioning with $(ranks(p)) ranks", "\n",
-    "$(ranks(p.x) > 1 ? spine_x(p) * " x-partitioning: $(p.x)\n" : "")", 
-    "$(ranks(p.y) > 1 ? spine_y(p) * " y-partitioning: $(p.y)\n" : "")", 
-    "$(ranks(p.z) > 1 ? "└── z-partitioning: $(p.z)\n" : "")")
+function Base.show(io::IO, p::Partition)
+    # Ranks per direction and their product, the total number of ranks
+    r = Rx, Ry, Rz = ranks(p)
+    Nr = prod(r)
+
+    rank_info = if Nr == 1
+        "1 rank"
+    else
+        "$Nr = $Rx×$Ry×$Rz ranks:"
+    end
+
+    print(io, "Partition across ", rank_info)
+    # One tree line per partitioned direction; `spine` picks "├──" when another line follows
+    if Rx > 1
+        s = spine(Ry, Rz)
+        print(io, '\n')
+        print(io, s, " x: ", p.x)
+    end
+
+    if Ry > 1
+        s = spine(Rz)
+        print(io, '\n')
+        print(io, s, " y: ", p.y)
+    end
+
+    if Rz > 1
+        s = "└──"
+        print(io, '\n')
+        print(io, s, " z: ", p.z)
+    end
+end
+
+spine(ξ, η=1) = ξ > 1 || η > 1 ? "├──" : "└──"
 
-spine_x(p) = ifelse(ranks(p.y) > 1 || ranks(p.z) > 1, "├──", "└──")
-spine_y(p) = ifelse(ranks(p.z) > 1, "├──", "└──")
- 
 """
     Equal()
 
@@ -111,11 +135,14 @@ Partition(x::Equal, y, z) = Partition(validate_partition(x, y, z)...)
 Partition(x, y::Equal, z) = Partition(validate_partition(x, y, z)...)
 Partition(x, y, z::Equal) = Partition(validate_partition(x, y, z)...)
 
-Base.show(io::IO, s::Sizes)      = print(io, "domain sizes:     $(s.sizes)")
-Base.show(io::IO, s::Fractional) = print(io, "domain fractions: $(s.sizes)")
+Base.summary(s::Sizes)      = string("Sizes", s.sizes)
+Base.summary(f::Fractional) = string("Fractional", f.sizes)
+
+Base.show(io::IO, s::Sizes)      = print(io, summary(s))
+Base.show(io::IO, f::Fractional) = print(io, summary(f))
 
 ranks(p::Partition)  = (ranks(p.x), ranks(p.y), ranks(p.z))
-ranks(r::Nothing)    = 1 # a direction not partitioned fits in 1 rank
+ranks(::Nothing)     = 1 # a direction not partitioned fits in 1 rank
 ranks(r::Int)        = r
 ranks(r::Sizes)      = length(r.sizes)
 ranks(r::Fractional) = length(r.sizes)
@@ -133,7 +160,8 @@ validate_partition(x, y, ::Equal) = x, y, remaining_workers(x, y)
 
 function remaining_workers(r1, r2)
     MPI.Initialized() || MPI.Init()    
-    return MPI.Comm_size(MPI.COMM_WORLD) ÷ (ranks(r1) * ranks(r2))
+    r12 = ranks(r1) * ranks(r2)
+    return MPI.Comm_size(MPI.COMM_WORLD) ÷ r12
 end
 
 struct Distributed{A, S, Δ, R, ρ, I, C, γ, M, T} <: AbstractArchitecture
@@ -173,10 +201,10 @@ end
 
 """
     Distributed(child_architecture = CPU(); 
-                communicator = MPI.COMM_WORLD,
+                partition = Partition(MPI.Comm_size(communicator)),
                 devices = nothing, 
-                synchronized_communication = false,
-                partition = Partition(MPI.Comm_size(communicator)))
+                communicator = MPI.COMM_WORLD,
+                synchronized_communication = false)
 
 Return a distributed architecture that uses MPI for communications.
 
@@ -189,43 +217,54 @@ Positional arguments
 Keyword arguments
 =================
 
-- `synchronized_communication`: If `true`, always use synchronized communication through ranks.
-                                Default: `false`.
-
 - `partition`: A [`Partition`](@ref) specifying the total processors in the `x`, `y`, and `z` direction.
                Note that support for distributed `z` direction is  limited; we strongly suggest
                using partition with `z = 1` kwarg.
 
 - `devices`: `GPU` device linked to local rank. The GPU will be assigned based on the 
              local node rank as such `devices[node_rank]`. Make sure to run `--ntasks-per-node` <= `--gres=gpu`.
-             If `nothing`, the devices will be assigned automatically based on the available resources
-
-- `communicator`: the MPI communicator, `MPI.COMM_WORLD`. This keyword argument should not be tampered with 
-                  if not for testing or developing. Change at your own risk!
+             If `nothing`, the devices will be assigned automatically based on the available resources.
+             This argument is irrelevant if `child_architecture = CPU()`.
+
+- `communicator`: the MPI communicator that orchestrates data transfer between nodes.
+                  Default: `MPI.COMM_WORLD`.
+
+- `synchronized_communication`: This keyword argument can be used to control downstream code behavior.
+                                If `true`, then downstream code may use this tag to toggle between an algorithm
+                                that permits communication between nodes "asynchronously" with other computations,
+                                and an alternative serial algorithm in which communication and computation are
+                                "synchronous" (that is, performed one after the other).
+                                Default: `false`, specifying the use of asynchronous algorithms where supported,
+                                which may result in faster time-to-solution.
 """
 function Distributed(child_architecture = CPU(); 
-                     communicator = nothing,
+                     partition = nothing,
                      devices = nothing, 
-                     synchronized_communication = false,
-                     partition = nothing)
+                     communicator = nothing,
+                     synchronized_communication = false)
 
     if !(MPI.Initialized())
         @info "MPI has not been initialized, so we are calling MPI.Init()."
         MPI.Init()
     end
 
-    communicator = isnothing(communicator) ? MPI.COMM_WORLD : communicator
-    partition    = isnothing(partition) ? Partition(MPI.Comm_size(communicator)) : partition
+    if isnothing(communicator) # default communicator
+        communicator = MPI.COMM_WORLD
+    end
 
-    ranks = size(partition)
-    Rx, Ry, Rz = ranks
-    total_ranks = Rx * Ry * Rz
-    mpi_ranks  = MPI.Comm_size(communicator)
+    mpi_ranks = MPI.Comm_size(communicator)
+
+    if isnothing(partition) # default partition
+        partition = Partition(mpi_ranks)
+    end
+
+    ranks = Rx, Ry, Rz = size(partition)
+    partition_ranks = Rx * Ry * Rz
 
     # TODO: make this error refer to `partition` (user input) rather than `ranks`
-    if total_ranks != mpi_ranks
-        throw(ArgumentError("Partition($Rx, $Ry, $Rz) [$total_ranks total ranks] inconsistent " *
-                            "with number of MPI ranks: $mpi_ranks."))
+    if partition_ranks != mpi_ranks
+        throw(ArgumentError("Partition($Rx, $Ry, $Rz) [$partition_ranks ranks] inconsistent " *
+                            "with $mpi_ranks MPI ranks"))
     end
 
     local_rank         = MPI.Comm_rank(communicator)
@@ -262,6 +301,8 @@ const SynchronizedDistributed = Distributed{<:Any, true}
 ##### All the architectures
 #####
 
+ranks(arch::Distributed) = ranks(arch.partition)
+
 child_architecture(arch::Distributed) = arch.child_architecture
 device(arch::Distributed)             = device(child_architecture(arch))
 
@@ -311,6 +352,8 @@ struct RankConnectivity{E, W, N, S, SW, SE, NW, NE}
     northeast :: NE
 end
 
+const NoConnectivity = RankConnectivity{Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}
+
 """
     RankConnectivity(; east, west, north, south, southwest, southeast, northwest, northeast)
 
@@ -370,17 +413,51 @@ end
 ##### Pretty printing
 #####
 
+# Short label for a `Distributed` architecture: the wrapper name parameterized by the child architecture type.
+function Base.summary(arch::Distributed)
+    A = arch |> child_architecture |> typeof
+    return string("Distributed{$A}")
+end
+
 function Base.show(io::IO, arch::Distributed)
+
+    Rx, Ry, Rz = arch.ranks
+    local_rank = arch.local_rank
+    Nr = prod(arch.ranks)
+    last_rank = Nr - 1
+
+    rank_info = if last_rank == 0
+        "1 rank:"
+    else
+        "$Nr = $Rx×$Ry×$Rz ranks:"
+    end
+
+    print(io, summary(arch), " across ", rank_info, '\n')
+    print(io, "├── local_rank: ", local_rank, " of 0-$last_rank", '\n')
+
+    ix, iy, iz = arch.local_index
+    index_info = string("index [$ix, $iy, $iz]")
+
     c = arch.connectivity
-    print(io, "Distributed architecture (rank $(arch.local_rank)/$(prod(arch.ranks)-1)) [index $(arch.local_index) / $(arch.ranks)]\n",
-              "└── child architecture: $(typeof(child_architecture(arch))) \n",
-              "└── connectivity:",
-              isnothing(c.east) ? "" : " east=$(c.east)",
-              isnothing(c.west) ? "" : " west=$(c.west)",
-              isnothing(c.north) ? "" : " north=$(c.north)",
-              isnothing(c.south) ? "" : " south=$(c.south)",
-              isnothing(c.southwest) ? "" : " southwest=$(c.southwest)",
-              isnothing(c.southeast) ? "" : " southeast=$(c.southeast)",
-              isnothing(c.northwest) ? "" : " northwest=$(c.northwest)",
-              isnothing(c.northeast) ? "" : " northeast=$(c.northeast)")
+    connectivity_info = if c isa NoConnectivity
+        nothing
+    else
+        string("└── connectivity:",
+               isnothing(c.east)      ? "" : " east=$(c.east)",
+               isnothing(c.west)      ? "" : " west=$(c.west)",
+               isnothing(c.north)     ? "" : " north=$(c.north)",
+               isnothing(c.south)     ? "" : " south=$(c.south)",
+               isnothing(c.southwest) ? "" : " southwest=$(c.southwest)",
+               isnothing(c.southeast) ? "" : " southeast=$(c.southeast)",
+               isnothing(c.northwest) ? "" : " northwest=$(c.northwest)",
+               isnothing(c.northeast) ? "" : " northeast=$(c.northeast)")
+    end
+
+    if isnothing(connectivity_info)
+        print(io, "└── local_index: [$ix, $iy, $iz]")
+    else
+        print(io, "├── local_index: [$ix, $iy, $iz]", '\n')
+        print(io, connectivity_info)
+    end
 end
+
diff --git a/src/DistributedComputations/distributed_fft_based_poisson_solver.jl b/src/DistributedComputations/distributed_fft_based_poisson_solver.jl
index 7199339f48..c73167e10d 100644
--- a/src/DistributedComputations/distributed_fft_based_poisson_solver.jl
+++ b/src/DistributedComputations/distributed_fft_based_poisson_solver.jl
@@ -1,28 +1,24 @@
-import PencilFFTs
-using PencilArrays: Permutation
-
 import FFTW 
 
+using CUDA: @allowscalar
+using Oceananigans.Grids: XYZRegularRG, XYRegularRG, XZRegularRG, YZRegularRG
+
 import Oceananigans.Solvers: poisson_eigenvalues, solve!
 import Oceananigans.Architectures: architecture
+import Oceananigans.Fields: interior
 
-struct DistributedFFTBasedPoissonSolver{P, F, L, λ, S, I}
+struct DistributedFFTBasedPoissonSolver{P, F, L, λ, B, S}
     plan :: P
     global_grid :: F
     local_grid :: L
     eigenvalues :: λ
+    buffer :: B
     storage :: S
-    input_permutation :: I
 end
 
 architecture(solver::DistributedFFTBasedPoissonSolver) =
     architecture(solver.global_grid)
 
-infer_transform(grid, d) = infer_transform(topology(grid, d)())
-infer_transform(::Periodic) = PencilFFTs.Transforms.FFT!()
-infer_transform(::Bounded)  = PencilFFTs.Transforms.R2R!(FFTW.REDFT10)
-infer_transform(::Flat)     = PencilFFTs.Transforms.NoTransform!()
-
 """
     DistributedFFTBasedPoissonSolver(global_grid, local_grid)
 
@@ -32,178 +28,202 @@ Return an FFT-based solver for the Poisson equation,
 ∇²φ = b
 ```
 
-on `Distributed`itectures.
+for `Distributed` architectures.
 
 Supported configurations
 ========================
 
-We support two "modes":
-
-  1. Vertical pencil decompositions: two-dimensional decompositions in ``(x, y)``
-     for three dimensional problems that satisfy either `Nz > Rx` or `Nz > Ry`.
+In the following, `Nx`, `Ny`, and `Nz` are the number of grid points of the **global** grid, 
+in the `x`, `y`, and `z` directions, while `Rx`, `Ry`, and `Rz` are the number of ranks in the
+`x`, `y`, and `z` directions, respectively. Furthermore, 'pencil' decomposition refers to a domain 
+decomposed in two different directions (i.e., with `Rx != 1` and `Ry != 1`), while 'slab' decomposition 
+refers to a domain decomposed only in one direction, (i.e., with either `Rx == 1` or `Ry == 1`).
+Additionally, `storage` indicates the `TransposableField` used for storing intermediate results;
+see [`TransposableField`](@ref).
 
-  2. One-dimensional decompositions in either ``x`` or ``y``.
+1. Three dimensional grids with pencil decompositions in ``(x, y)`` such that
+the `z` direction is local, `Ny ≥ Rx` and `Ny % Rx = 0`, and `Nz ≥ Ry` and `Nz % Ry = 0`.
 
-Above, `Nz = size(global_grid, 3)` and `Rx, Ry, Rz = architecture(local_grid).ranks`.
+2. Two dimensional grids decomposed in ``x`` where `Ny ≥ Rx` and `Ny % Rx = 0`.
 
-Other configurations that are decomposed in ``(x, y)`` but have too few `Nz`,
-or any configuration decomposed in ``z``, are _not_ supported.
+!!! warning "Unsupported decompositions"
+    _Any_ configuration decomposed in ``z`` direction is _not_ supported.
+    Furthermore, any ``(x, y)`` decompositions other than the configurations mentioned above are also _not_ supported.
 
-Algorithm for two-dimensional decompositions
-============================================
+Algorithm for pencil decompositions
+===================================
 
-For two-dimensional decompositions (useful for three-dimensional problems),
-there are three forward transforms, three backward transforms,
-and four transpositions requiring MPI communication. In the schematic below, the first
-dimension is always the local dimension. In our implementation of the PencilFFTs algorithm,
-we require _either_ `Nz >= Rx`, _or_ `Nz >= Ry`, where `Nz` is the number of vertical cells,
-`Rx` is the number of ranks in ``x``, and `Ry` is the number of ranks in ``y``.
-Below, we outline the algorithm for the case `Nz >= Rx`.
-If `Nz < Rx`, but `Nz > Ry`, a similar algorithm applies with ``x`` and ``y`` swapped:
+For pencil decompositions (useful for three-dimensional problems), there are three forward transforms, 
+three backward transforms, and four transpositions that require MPI communication. 
+In the algorithm below, the first dimension is always the local dimension. In our implementation we require
+`Nz ≥ Ry` and `Ny ≥ Rx` with the additional constraint that `Nz % Ry = 0` and `Ny % Rx = 0`.
 
-1. `first(storage)` is initialized with layout ``(z, x, y)``.
+1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs` that is ``b``.
 2. Transform along ``z``.
-3  Transpose + communicate to `storage[2]` in layout ``(x, z, y)``,
-   which is distributed into `(Rx, Ry)` processes in ``(z, y)``.
-4. Transform along ``x``.
-5. Transpose + communicate to `last(storage)` in layout ``(y, x, z)``,
-   which is distributed into `(Rx, Ry)` processes in ``(x, z)``.
-6. Transform in ``y``.
+3. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``.
+4. Transform along ``y``.
+5. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``.
+6. Transform in ``x``.
 
 At this point the three in-place forward transforms are complete, and we
-solve the Poisson equation by updating `last(storage)`.
-Then the process is reversed to obtain `first(storage)` in physical
-space and with the layout ``(z, x, y)``.
+solve the Poisson equation by updating `storage.xfield`.
+Then the process is reversed to obtain `storage.zfield` in physical
+space partitioned over ``(x, y)``.
 
-Restrictions
-============
+Algorithm for slab decompositions
+=================================
 
-The algorithm for two-dimensional decompositions requires that `Nz = size(global_grid, 3)` is larger
-than either `Rx = ranks[1]` or `Ry = ranks[2]`, where `ranks` are configured when building `Distributed`.
-If `Nz` does not satisfy this condition, we can only support a one-dimensional decomposition.
+The slab decomposition algorithm works in the same manner as the pencil decomposition described above
+while skipping the transposes that are not required. For example if the domain is decomposed in ``x``,
+step 3 in the above algorithm is skipped (and the associated transposition step in the backward transform).
 
-Algorithm for one-dimensional decompositions
-============================================
+Restrictions
+============
 
-This algorithm requires a one-dimensional decomposition with _either_ `Rx = 1`
-_or_ `Ry = 1`, and is important to support two-dimensional transforms.
+1. Pencil decompositions:
+    - `Ny ≥ Rx` and `Ny % Rx = 0`
+    - `Nz ≥ Ry` and `Nz % Ry = 0`
+    - If the ``z`` direction is `Periodic`, also the ``y`` and the ``x`` directions must be `Periodic`
+    - If the ``y`` direction is `Periodic`, also the ``x`` direction must be `Periodic`
 
-For one-dimensional decompositions, we place the decomposed direction _last_.
-If the number of ranks is `Rh = max(Rx, Ry)`, this algorithm requires that 
-_both_ `Nx > Rh` _and_ `Ny > Rh`. The resulting flow of transposes and transforms
-is similar to the two-dimensional case. It remains somewhat of a mystery why this
-succeeds (i.e., why the last transform is correctly decomposed).
+2. Slab decomposition:
+    - same as for pencil decompositions with `Rx` (or `Ry`) equal to one
 """
-function DistributedFFTBasedPoissonSolver(global_grid, local_grid)
+function DistributedFFTBasedPoissonSolver(global_grid, local_grid, planner_flag=FFTW.PATIENT)
 
-    arch = architecture(local_grid)
-    Rx, Ry, Rz = arch.ranks
-    communicator = arch.communicator
+    validate_poisson_solver_distributed_grid(global_grid)
+    validate_poisson_solver_configuration(global_grid, local_grid)
 
-    # We don't support distributing anything in z.
-    Rz == 1 || throw(ArgumentError("Non-singleton ranks in the vertical are not supported by DistributedFFTBasedPoissonSolver."))
+    FT = Complex{eltype(local_grid)}
+
+    storage = TransposableField(CenterField(local_grid), FT)
 
-    gNx, gNy, gNz = size(global_grid)
+    arch = architecture(storage.xfield.grid)
+    child_arch = child_architecture(arch)
 
     # Build _global_ eigenvalues
     topo = (TX, TY, TZ) = topology(global_grid)
-    λx = poisson_eigenvalues(global_grid.Nx, global_grid.Lx, 1, TX())
-    λy = poisson_eigenvalues(global_grid.Ny, global_grid.Ly, 2, TY())
-    λz = poisson_eigenvalues(global_grid.Nz, global_grid.Lz, 3, TZ())
-
-    # Drop singleton dimensions for compatibility with PencilFFTs' localgrid
-    λx = dropdims(λx, dims=(2, 3))
-    λy = dropdims(λy, dims=(1, 3))
-    λz = dropdims(λz, dims=(1, 2))
-
-    unpermuted_eigenvalues = (λx, λy, λz)
-
-    # First we check if we can do a two-dimensional decomposition
-    if gNz >= Rx 
-        input_permutation = Permutation(3, 1, 2)
-        permuted_size = (gNz, gNx, gNy)
-        processors_per_dimension = (Rx, Ry)
-    elseif gNz >= Ry
-        input_permutation = Permutation(3, 2, 1)
-        permuted_size = (gNz, gNy, gNx)
-        processors_per_dimension = (Ry, Rx)
-
-    else # it has to be a one-dimensional decomposition
-
-        Rx > 1 && Ry > 1 &&
-            throw(ArgumentError("DistributedFFTBasedPoissonSolver requires either " *
-                                "(i) Nz > Rx, (ii) Nz > Ry, (iii) Rx = 1, _or_ (iv) Ry = 1."))
-
-        if Rx == 1 # x-local, y-distributed
-            permuted_size = (gNz, gNx, gNy)
-            input_permutation = Permutation(3, 1, 2)
-            processors_per_dimension = (1, Ry)
-        else # Ry == 1, y-local, x-distributed
-            permuted_size = (gNz, gNy, gNx)
-            input_permutation = Permutation(3, 2, 1)
-            processors_per_dimension = (1, Rx)
-        end
-    end
-
-    transforms = Tuple(infer_transform(global_grid, d) for d in Tuple(input_permutation))
-    plan = PencilFFTs.PencilFFTPlan(permuted_size, transforms, processors_per_dimension, communicator)
-
-    # Allocate memory for in-place FFT + transpositions
-    storage = PencilFFTs.allocate_input(plan)
-
-    # Permute the λ appropriately
-    permuted_eigenvalues = Tuple(unpermuted_eigenvalues[d] for d in Tuple(input_permutation))
-    eigenvalues = PencilFFTs.localgrid(last(storage), permuted_eigenvalues)
-
-    return DistributedFFTBasedPoissonSolver(plan, global_grid, local_grid, eigenvalues, storage, input_permutation)
+    λx = dropdims(poisson_eigenvalues(global_grid.Nx, global_grid.Lx, 1, TX()), dims=(2, 3))
+    λy = dropdims(poisson_eigenvalues(global_grid.Ny, global_grid.Ly, 2, TY()), dims=(1, 3))
+    λz = dropdims(poisson_eigenvalues(global_grid.Nz, global_grid.Lz, 3, TZ()), dims=(1, 2))
+        
+    λx = partition_coordinate(λx, size(storage.xfield.grid, 1), arch, 1)
+    λy = partition_coordinate(λy, size(storage.xfield.grid, 2), arch, 2)
+    λz = partition_coordinate(λz, size(storage.xfield.grid, 3), arch, 3)
+
+    λx = on_architecture(child_arch, λx)
+    λy = on_architecture(child_arch, λy)
+    λz = on_architecture(child_arch, λz)
+
+    eigenvalues = (λx, λy, λz)
+
+    plan = plan_distributed_transforms(global_grid, storage, planner_flag)
+    
+    # We need to permute indices to apply bounded transforms on the GPU (r2r of r2c with twiddling)
+    x_buffer_needed = child_arch isa GPU && TX == Bounded
+    z_buffer_needed = child_arch isa GPU && TZ == Bounded 
+    
+    # We cannot really batch anything, so on GPUs we always have to permute indices in the y direction
+    y_buffer_needed = child_arch isa GPU
+
+    buffer_x = x_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.xfield)...)) : nothing
+    buffer_y = y_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.yfield)...)) : nothing 
+    buffer_z = z_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.zfield)...)) : nothing
+
+    buffer = (; x = buffer_x, y = buffer_y, z = buffer_z)
+
+    return DistributedFFTBasedPoissonSolver(plan, global_grid, local_grid, eigenvalues, buffer, storage)
 end
 
-# solve! requires that `b` in `A x = b` (the right hand side)
-# was computed and stored in first(solver.storage) prior to calling `solve!(x, solver)`.
+# solve! requires that `b` in `A x = b` (the right hand side) 
+# is copied in the solver storage
 # See: Models/NonhydrostaticModels/solve_for_pressure.jl
 function solve!(x, solver::DistributedFFTBasedPoissonSolver)
-    arch = architecture(solver.global_grid)
-    multi_arch = architecture(solver.local_grid)
+    storage = solver.storage
+    buffer  = solver.buffer
+    arch    = architecture(storage.xfield.grid)
 
     # Apply forward transforms to b = first(solver.storage).
-    solver.plan * solver.storage
-
+    solver.plan.forward.z!(parent(storage.zfield), buffer.z)
+    transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield
+    solver.plan.forward.y!(parent(storage.yfield), buffer.y) 
+    transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield
+    solver.plan.forward.x!(parent(storage.xfield), buffer.x)
+    
     # Solve the discrete Poisson equation in wavenumber space
     # for x̂. We solve for x̂ in place, reusing b̂.
-    x̂ = b̂ = last(solver.storage)
     λ = solver.eigenvalues
-    @. x̂ = - b̂ / (λ[1] + λ[2] + λ[3])
+    x̂ = b̂ = parent(storage.xfield)
+    
+    launch!(arch, storage.xfield.grid, :xyz, _solve_poisson_in_spectral_space!, x̂, b̂, λ[1], λ[2], λ[3])
 
     # Set the zeroth wavenumber and volume mean, which are undetermined
     # in the Poisson equation, to zero.
-    if MPI.Comm_rank(multi_arch.communicator) == 0
-        # This is an assumption: we *hope* PencilArrays allocates in this way
-        parent(x̂)[1, 1, 1] = 0
+    if arch.local_rank == 0
+        @allowscalar x̂[1, 1, 1] = 0
     end
 
-    # Apply backward transforms to x̂ = last(solver.storage).
-    solver.plan \ solver.storage
-
-    # xc is the backward transform of x̂.
-    xc = first(solver.storage)
+    # Apply backward transforms to x̂ = parent(storage.xfield).
+    solver.plan.backward.x!(parent(storage.xfield), buffer.x)
+    transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield
+    solver.plan.backward.y!(parent(storage.yfield), buffer.y)
+    transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield
+    solver.plan.backward.z!(parent(storage.zfield), buffer.z) # last backwards transform is in z
 
     # Copy the real component of xc to x.
     launch!(arch, solver.local_grid, :xyz,
-            copy_permuted_real_component!, x, parent(xc), solver.input_permutation)
+            _copy_real_component!, x, parent(storage.zfield))
 
     return x
 end
 
-const ZXYPermutation = Permutation{(3, 1, 2), 3}
-const ZYXPermutation = Permutation{(3, 2, 1), 3}
-
-@kernel function copy_permuted_real_component!(ϕ, ϕc, ::ZXYPermutation)
+@kernel function _solve_poisson_in_spectral_space!(x̂, b̂, λx, λy, λz)
     i, j, k = @index(Global, NTuple)
-    @inbounds ϕ[i, j, k] = real(ϕc[k, i, j])
+    @inbounds x̂[i, j, k] = - b̂[i, j, k] / (λx[i] + λy[j] + λz[k])
 end
 
-@kernel function copy_permuted_real_component!(ϕ, ϕc, ::ZYXPermutation)
+@kernel function _copy_real_component!(ϕ, ϕc)
     i, j, k = @index(Global, NTuple)
-    @inbounds ϕ[i, j, k] = real(ϕc[k, j, i])
+    @inbounds ϕ[i, j, k] = real(ϕc[i, j, k])
 end
 
+# TODO: bring up to speed the PCG to remove this error
+validate_poisson_solver_distributed_grid(global_grid) = 
+        throw("Grids other than the RectilinearGrid are not supported in the Distributed NonhydrostaticModels")
+
+function validate_poisson_solver_distributed_grid(global_grid::RectilinearGrid) 
+    TX, TY, TZ = topology(global_grid)
+    # Throws on an unsupported topology or grid spacing; returns `nothing` when both checks pass.
+    if (TY == Bounded && TZ == Periodic) || (TX == Bounded && TY == Periodic) || (TX == Bounded && TZ == Periodic)
+        throw("Distributed Poisson solvers do not support grids with topology ($TX, $TY, $TZ) at the moment.
+               A Periodic z-direction requires also the y- and x-directions to be Periodic, while a Periodic y-direction requires also 
+               the x-direction to be Periodic.")
+    end
+    # The grid must be regularly spaced in at least two directions (variable spacing in at most one).
+    if !(global_grid isa YZRegularRG) && !(global_grid isa XYRegularRG) && !(global_grid isa XZRegularRG) 
+        throw("The provided grid is stretched in directions $(stretched_dimensions(global_grid)). 
+               A distributed Poisson solver supports only RectilinearGrids that have variably-spaced cells in at most one direction.")
+    end
+
+    return nothing
+end
+
+function validate_poisson_solver_configuration(global_grid, local_grid)
+    # Throws if the rank layout of `local_grid` is incompatible with `global_grid`; returns `nothing` otherwise.
+    # We don't support distributing anything in z.
+    Rx, Ry, Rz = architecture(local_grid).ranks
+    Rz == 1 || throw("Non-singleton ranks in the vertical are not supported by distributed Poisson solvers.")
+    
+    # Limitation of the current implementation (see the docstring): Ry must divide Nz
+    if global_grid.Nz % Ry != 0
+        throw("The number of ranks in the y-direction are $(Ry) with Nz = $(global_grid.Nz) cells in the z-direction.
+               The distributed Poisson solver requires that the number of ranks in the y-direction divide Nz.")
+    end
+    # Same limitation for the other pair: Rx must divide Ny
+    if global_grid.Ny % Rx != 0
+        throw("The number of ranks in the x-direction are $(Rx) with Ny = $(global_grid.Ny) cells in the y-direction.
+               The distributed Poisson solver requires that the number of ranks in the x-direction divide Ny.")
+    end
+
+    return nothing
+end
diff --git a/src/DistributedComputations/distributed_fft_tridiagonal_solver.jl b/src/DistributedComputations/distributed_fft_tridiagonal_solver.jl
new file mode 100644
index 0000000000..8911cb98d8
--- /dev/null
+++ b/src/DistributedComputations/distributed_fft_tridiagonal_solver.jl
@@ -0,0 +1,352 @@
+using CUDA: @allowscalar
+using Oceananigans.Grids: stretched_dimensions
+using Oceananigans.Grids: XDirection, YDirection
+using Oceananigans.Operators: Δxᶠᵃᵃ, Δyᵃᶠᵃ, Δzᵃᵃᶠ
+
+using Oceananigans.Solvers: BatchedTridiagonalSolver, 
+                            stretched_direction, 
+                            ZTridiagonalSolver, 
+                            YTridiagonalSolver, 
+                            XTridiagonalSolver,
+                            compute_main_diagonal!
+
+struct DistributedFourierTridiagonalPoissonSolver{G, L, B, P, R, S, β} # Fourier transforms in two directions + tridiagonal solve in the third
+    plan :: P              # named tuple `(; forward, backward)` holding the transforms for the two non-tridiagonal directions
+    global_grid :: G # grid passed as `global_grid` to the constructor; used by `architecture(solver)`
+    local_grid :: L # grid passed as `local_grid` to the constructor; used to launch the final copy kernel
+    batched_tridiagonal_solver :: B # `BatchedTridiagonalSolver` built along the tridiagonal direction
+    source_term :: R # complex array sized like the tridiagonal-direction storage grid; right-hand side of the tridiagonal solve
+    storage :: S # `TransposableField` with `xfield`, `yfield` and `zfield` used for transforms and transposes
+    buffer :: β # named tuple of per-direction work arrays; an entry is `nothing` when no buffer is needed
+end
+
+# Useful aliases for dispatch...
+const XStretchedDistributedSolver = DistributedFourierTridiagonalPoissonSolver{<:Any, <:Any, <:XTridiagonalSolver}
+const YStretchedDistributedSolver = DistributedFourierTridiagonalPoissonSolver{<:Any, <:Any, <:YTridiagonalSolver}
+const ZStretchedDistributedSolver = DistributedFourierTridiagonalPoissonSolver{<:Any, <:Any, <:ZTridiagonalSolver}
+
+architecture(solver::DistributedFourierTridiagonalPoissonSolver) =
+    architecture(solver.global_grid)
+
+@inline Δξᶠ(i, grid, ::Val{1}) = Δxᶠᵃᵃ(i, 1, 1, grid)
+@inline Δξᶠ(j, grid, ::Val{2}) = Δyᵃᶠᵃ(1, j, 1, grid)
+@inline Δξᶠ(k, grid, ::Val{3}) = Δzᵃᵃᶠ(1, 1, k, grid)
+    
+"""
+    DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid)
+
+Return an FFT-based solver for the Poisson equation evaluated on a `local_grid` that has a non-uniform
+spacing in exactly one direction (i.e. either in x, y or z)
+
+```math
+∇²φ = b
+```
+
+for `Distributed` architectures.
+
+Supported configurations
+========================
+
+In the following, `Nx`, `Ny`, and `Nz` are the number of grid points of the **global** grid
+in the `x`, `y`, and `z` directions, while `Rx`, `Ry`, and `Rz` are the number of ranks in the
+`x`, `y`, and `z` directions, respectively. Furthermore, 'pencil' decomposition refers to a domain 
+decomposed in two different directions (i.e., with `Rx != 1` and `Ry != 1`), while 'slab' decomposition 
+refers to a domain decomposed only in one direction, (i.e., with either `Rx == 1` or `Ry == 1`).
+Additionally, `storage` indicates the `TransposableField` used for storing intermediate results;
+see [`TransposableField`](@ref).
+
+1. Three dimensional configurations with 'pencil' decompositions in ``(x, y)`` 
+where `Ny ≥ Rx` and `Ny % Rx = 0`, and `Nz ≥ Ry` and `Nz % Ry = 0`.
+
+2. Two dimensional configurations decomposed in ``x`` where `Ny ≥ Rx` and `Ny % Rx = 0`
+    
+!!! warning "Unsupported decompositions"
+    _Any_ configuration decomposed in ``z`` direction is _not_ supported.
+    Furthermore, any ``(x, y)`` decompositions other than the configurations mentioned above are also _not_ supported.
+    
+Algorithm for pencil decompositions
+============================================
+
+For pencil decompositions (useful for three-dimensional problems),
+there are two forward transforms, two backward transforms, one tri-diagonal matrix inversion
+and a variable number of transpositions that require MPI communication, dependent on the 
+stretched direction:
+
+- a stretching in the x-direction requires four transpositions
+- a stretching in the y-direction requires six transpositions
+- a stretching in the z-direction requires eight transpositions
+
+!!! note "Computational cost"
+    Because of the additional transpositions, a stretching in the x-direction
+    is computationally cheaper than a stretching in the y-direction, and the latter
+    is cheaper than a stretching in the z-direction
+
+In our implementation we require `Nz ≥ Ry` and `Ny ≥ Rx` with the additional constraint 
+that `Nz % Ry = 0` and `Ny % Rx = 0`.
+
+x - stretched algorithm
+========================
+
+1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs`.
+2. Transform along ``z``.
+3. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``.
+4. Transform along ``y``.
+5. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``.
+6. Solve the tri-diagonal linear system in the ``x`` direction.
+
+Steps 5 -> 1 are reversed to obtain the result in physical space stored in `storage.zfield` 
+partitioned over ``(x, y)``.
+
+y - stretched algorithm
+========================
+
+1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs`.
+2. Transform along ``z``.
+3. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``.
+4. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``.
+5. Transform along ``x``.
+6. Transpose `storage.xfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``.
+7. Solve the tri-diagonal linear system in the ``y`` direction.
+
+Steps 6 -> 1 are reversed to obtain the result in physical space stored in `storage.zfield` 
+partitioned over ``(x, y)``.
+
+z - stretched algorithm
+========================
+
+1. `storage.zfield`, partitioned over ``(x, y)`` is initialized with the `rhs`.
+2. Transpose `storage.zfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``.
+3. Transform along ``y``.
+4. Transpose `storage.yfield` + communicate to `storage.xfield` partitioned into `(Rx, Ry)` processes in ``(y, z)``.
+5. Transform along ``x``.
+6. Transpose `storage.xfield` + communicate to `storage.yfield` partitioned into `(Rx, Ry)` processes in ``(x, z)``.
+7. Transpose `storage.yfield` + communicate to `storage.zfield` partitioned into `(Rx, Ry)` processes in ``(x, y)``.
+8. Solve the tri-diagonal linear system in the ``z`` direction.
+
+Steps 7 -> 1 are reversed to obtain the result in physical space stored in `storage.zfield` 
+partitioned over ``(x, y)``.
+
+Algorithm for slab decompositions
+=============================
+
+The 'slab' decomposition works in the same manner while skipping the transposes that
+are not required. For example if the domain is decomposed in ``x``, step 4. and 6. in the above algorithm
+are skipped (and the associated reversed step in the backward transform)
+
+Restrictions
+============
+
+1. Pencil decompositions:
+    - `Ny ≥ Rx` and `Ny % Rx = 0`
+    - `Nz ≥ Ry` and `Nz % Ry = 0`
+    - If the ``z`` direction is `Periodic`, also the ``y`` and the ``x`` directions must be `Periodic`
+    - If the ``y`` direction is `Periodic`, also the ``x`` direction must be `Periodic`
+
+2. Slab decomposition:
+    - Same as for two-dimensional decompositions with `Rx` (or `Ry`) equal to one
+
+"""
+function DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid, planner_flag=FFTW.PATIENT; tridiagonal_direction = nothing)
+    
+    validate_poisson_solver_distributed_grid(global_grid)
+    validate_poisson_solver_configuration(global_grid, local_grid)
+ 
+    if isnothing(tridiagonal_direction) 
+        tridiagonal_dim = stretched_dimensions(local_grid)[1]
+        tridiagonal_direction = stretched_direction(local_grid)
+    else
+        tridiagonal_dim = tridiagonal_direction == XDirection() ? 1 : 
+                          tridiagonal_direction == YDirection() ? 2 : 3
+    end
+
+    topology(global_grid, tridiagonal_dim) != Bounded &&
+        error("`DistributedFourierTridiagonalPoissonSolver` requires that the stretched direction (dimension $tridiagonal_dim) is `Bounded`.")
+
+    FT         = Complex{eltype(local_grid)}
+    child_arch = child_architecture(local_grid)
+    storage    = TransposableField(CenterField(local_grid), FT)
+
+    topo = (TX, TY, TZ) = topology(global_grid)
+    λx = dropdims(poisson_eigenvalues(global_grid.Nx, global_grid.Lx, 1, TX()), dims=(2, 3))
+    λy = dropdims(poisson_eigenvalues(global_grid.Ny, global_grid.Ly, 2, TY()), dims=(1, 3))
+    λz = dropdims(poisson_eigenvalues(global_grid.Nz, global_grid.Lz, 3, TZ()), dims=(1, 2))
+        
+    if tridiagonal_dim == 1
+        arch = architecture(storage.xfield.grid)
+        grid = storage.xfield.grid
+        λ1 = partition_coordinate(λy, size(grid, 2), arch, 2)
+        λ2 = partition_coordinate(λz, size(grid, 3), arch, 3)
+    elseif tridiagonal_dim == 2
+        arch = architecture(storage.yfield.grid)
+        grid = storage.yfield.grid
+        λ1 = partition_coordinate(λx, size(grid, 1), arch, 1)
+        λ2 = partition_coordinate(λz, size(grid, 3), arch, 3)
+    elseif tridiagonal_dim == 3
+        arch = architecture(storage.zfield.grid)
+        grid = storage.zfield.grid
+        λ1 = partition_coordinate(λx, size(grid, 1), arch, 1)
+        λ2 = partition_coordinate(λy, size(grid, 2), arch, 2)
+    end
+
+    λ1 = on_architecture(child_arch, λ1)
+    λ2 = on_architecture(child_arch, λ2)
+
+    plan = plan_distributed_transforms(global_grid, storage, planner_flag)
+
+    # Lower and upper diagonals are the same
+    lower_diagonal = @allowscalar [ 1 / Δξᶠ(q, grid, Val(tridiagonal_dim)) for q in 2:size(grid, tridiagonal_dim) ]
+    lower_diagonal = on_architecture(child_arch, lower_diagonal)
+    upper_diagonal = lower_diagonal
+
+    # Compute diagonal coefficients for each grid point
+    diagonal = zeros(eltype(grid), size(grid)...)
+    diagonal = on_architecture(arch, diagonal)
+    launch_config = if tridiagonal_dim == 1
+                        :yz
+                    elseif tridiagonal_dim == 2
+                        :xz
+                    elseif tridiagonal_dim == 3
+                        :xy
+                    end
+
+    launch!(arch, grid, launch_config, compute_main_diagonal!, diagonal, grid, λ1, λ2, tridiagonal_direction)
+
+    # Set up batched tridiagonal solver
+    btsolver = BatchedTridiagonalSolver(grid; lower_diagonal, diagonal, upper_diagonal, tridiagonal_direction)
+    
+    # We need to permute indices to apply bounded transforms on the GPU (r2r of r2c with twiddling)
+    x_buffer_needed = child_arch isa GPU && TX == Bounded
+    z_buffer_needed = child_arch isa GPU && TZ == Bounded 
+    
+    # We cannot really batch anything, so on GPUs we always have to permute indices in the y direction
+    y_buffer_needed = child_arch isa GPU
+
+    buffer_x = x_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.xfield)...)) : nothing
+    buffer_y = y_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.yfield)...)) : nothing 
+    buffer_z = z_buffer_needed ? on_architecture(child_arch, zeros(FT, size(storage.zfield)...)) : nothing
+
+    buffer = if tridiagonal_dim == 1
+        (; y = buffer_y, z = buffer_z)
+    elseif tridiagonal_dim == 2
+        (; x = buffer_x, z = buffer_z)
+    elseif tridiagonal_dim == 3
+        (; x = buffer_x, y = buffer_y)
+    end
+
+    if tridiagonal_dim == 1
+        forward  = (y! = plan.forward.y!,  z! = plan.forward.z!)
+        backward = (y! = plan.backward.y!, z! = plan.backward.z!)
+    elseif tridiagonal_dim == 2
+        forward  = (x! = plan.forward.x!,  z! = plan.forward.z!)
+        backward = (x! = plan.backward.x!, z! = plan.backward.z!)
+    elseif tridiagonal_dim == 3
+        forward  = (x! = plan.forward.x!,  y! = plan.forward.y!)
+        backward = (x! = plan.backward.x!, y! = plan.backward.y!)
+    end
+
+    plan = (; forward, backward)
+
+    # Storage space for right hand side of Poisson equation
+    T = complex(eltype(grid))
+    source_term = zeros(T, size(grid)...)
+    source_term = on_architecture(arch, source_term)
+
+    return DistributedFourierTridiagonalPoissonSolver(plan, global_grid, local_grid, btsolver, source_term, storage, buffer)
+end
+
+# solve! requires that `b` in `A x = b` (the right-hand side)
+# has already been copied into the solver storage.
+# See: Models/NonhydrostaticModels/solve_for_pressure.jl
+function solve!(x, solver::ZStretchedDistributedSolver)
+    arch    = architecture(solver)
+    storage = solver.storage
+    buffer  = solver.buffer
+    # Forward pass: transform in y and then in x, interleaved with the transposes between the z-, y- and x-fields
+    transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield
+    solver.plan.forward.y!(parent(storage.yfield), buffer.y) # forward transform in y
+    transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield
+    solver.plan.forward.x!(parent(storage.xfield), buffer.x) # forward transform in x
+    transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield
+    transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield
+  
+    # Copy the transformed right-hand side from storage.zfield into the source term
+    parent(solver.source_term) .= parent(storage.zfield)
+
+    # Perform the implicit vertical solve here on storage.zfield...
+    # Solve tridiagonal system of linear equations at every z-column.
+    solve!(storage.zfield, solver.batched_tridiagonal_solver, solver.source_term)
+
+    transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield
+    transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield
+    solver.plan.backward.x!(parent(storage.xfield), buffer.x) # backward transform in x
+    transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield
+    solver.plan.backward.y!(parent(storage.yfield), buffer.y) # backward transform in y
+    transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield
+
+    # Copy the real component of storage.zfield into x.
+    launch!(arch, solver.local_grid, :xyz,
+            _copy_real_component!, x, parent(storage.zfield))
+
+    return x
+end
+
+function solve!(x, solver::YStretchedDistributedSolver)
+    arch    = architecture(solver)
+    storage = solver.storage
+    buffer  = solver.buffer
+    # Forward pass: transform in z and then in x, finishing with the data in storage.yfield
+    solver.plan.forward.z!(parent(storage.zfield), buffer.z) # forward transform in z
+    transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield
+    transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield
+    solver.plan.forward.x!(parent(storage.xfield), buffer.x) # forward transform in x
+    transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield
+  
+    # Copy the transformed right-hand side from storage.yfield into the source term
+    parent(solver.source_term) .= parent(storage.yfield)
+
+    # Perform the implicit tridiagonal solve here on storage.yfield...
+    # Solve tridiagonal system of linear equations at every y-column.
+    solve!(storage.yfield, solver.batched_tridiagonal_solver, solver.source_term)
+
+    transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield
+    solver.plan.backward.x!(parent(storage.xfield), buffer.x) # backward transform in x
+    transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield
+    transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield
+    solver.plan.backward.z!(parent(storage.zfield), buffer.z) # backward transform in z
+
+    # Copy the real component of storage.zfield into x.
+    launch!(arch, solver.local_grid, :xyz,
+            _copy_real_component!, x, parent(storage.zfield))
+
+    return x
+end
+
+function solve!(x, solver::XStretchedDistributedSolver)
+    arch    = architecture(solver)
+    storage = solver.storage
+    buffer  = solver.buffer
+
+    # Apply forward transforms in z and then in y, finishing with the data in storage.xfield.
+    solver.plan.forward.z!(parent(storage.zfield), buffer.z) # forward transform in z
+    transpose_z_to_y!(storage) # copy data from storage.zfield to storage.yfield
+    solver.plan.forward.y!(parent(storage.yfield), buffer.y) # forward transform in y
+    transpose_y_to_x!(storage) # copy data from storage.yfield to storage.xfield
+
+    # Copy the transformed right-hand side from storage.xfield into the source term
+    parent(solver.source_term) .= parent(storage.xfield)
+
+    # Perform the implicit tridiagonal solve here on storage.xfield...
+    # Solve tridiagonal system of linear equations at every x-column.
+    solve!(storage.xfield, solver.batched_tridiagonal_solver, solver.source_term)
+
+    transpose_x_to_y!(storage) # copy data from storage.xfield to storage.yfield
+    solver.plan.backward.y!(parent(storage.yfield), buffer.y) # backward transform in y
+    transpose_y_to_z!(storage) # copy data from storage.yfield to storage.zfield
+    solver.plan.backward.z!(parent(storage.zfield), buffer.z) # last backwards transform is in z
+
+    # Copy the real component of storage.zfield into x.
+    launch!(arch, solver.local_grid, :xyz,
+            _copy_real_component!, x, parent(storage.zfield))
+
+    return x
+end
\ No newline at end of file
diff --git a/src/DistributedComputations/distributed_fields.jl b/src/DistributedComputations/distributed_fields.jl
index 62b7fca628..17f5ef930e 100644
--- a/src/DistributedComputations/distributed_fields.jl
+++ b/src/DistributedComputations/distributed_fields.jl
@@ -1,16 +1,20 @@
-import Oceananigans.Fields: Field, FieldBoundaryBuffers, location, set!
-import Oceananigans.BoundaryConditions: fill_halo_regions!
-
 using CUDA: CuArray
+using OffsetArrays: OffsetArray
 using Oceananigans.Grids: topology
-using Oceananigans.Fields: validate_field_data, indices, validate_boundary_conditions, validate_indices, recv_from_buffers!
+using Oceananigans.Fields: validate_field_data, indices, validate_boundary_conditions
+using Oceananigans.Fields: validate_indices, recv_from_buffers!, set_to_array!, set_to_field!
+
+import Oceananigans.Fields: Field, FieldBoundaryBuffers, location, set!
+import Oceananigans.BoundaryConditions: fill_halo_regions!
 
 function Field((LX, LY, LZ)::Tuple, grid::DistributedGrid, data, old_bcs, indices::Tuple, op, status)
-    arch = architecture(grid)
     indices = validate_indices(indices, (LX, LY, LZ), grid)
     validate_field_data((LX, LY, LZ), data, grid, indices)
     validate_boundary_conditions((LX, LY, LZ), grid, old_bcs)
-    new_bcs = inject_halo_communication_boundary_conditions(old_bcs, arch.local_rank, arch.connectivity, topology(grid))
+
+    arch = architecture(grid)
+    rank = arch.local_rank
+    new_bcs = inject_halo_communication_boundary_conditions(old_bcs, rank, arch.connectivity, topology(grid))
     buffers = FieldBoundaryBuffers(grid, data, new_bcs)
 
     return Field{LX, LY, LZ}(grid, data, new_bcs, indices, op, status, buffers)
@@ -19,42 +23,36 @@ end
 const DistributedField      = Field{<:Any, <:Any, <:Any, <:Any, <:DistributedGrid}
 const DistributedFieldTuple = NamedTuple{S, <:NTuple{N, DistributedField}} where {S, N}
 
-function set!(u::DistributedField, f::Function)
-    arch = architecture(u)
-    if child_architecture(arch) isa GPU
-        cpu_grid = on_architecture(cpu_architecture(arch), u.grid)
-        u_cpu = Field(location(u), cpu_grid; indices = indices(u))
-        f_field = field(location(u), f, cpu_grid)
-        set!(u_cpu, f_field)
-        set!(u, u_cpu)
-    elseif child_architecture(arch) isa CPU
-        f_field = field(location(u), f, u.grid)
-        set!(u, f_field)
+global_size(f::DistributedField) = global_size(architecture(f), size(f))
+
+# Set `u` from an array `V`, partitioning `V` under the hood when its size matches the global size of `u`
+function set!(u::DistributedField, V::Union{Array, CuArray, OffsetArray})
+    NV = size(V)
+    Nu = global_size(u)
+
+    # Drop singleton (length-1) dimensions so that only the non-trivial extents are compared
+    NV′ = filter(n -> n > 1, NV)
+    Nu′ = filter(n -> n > 1, Nu)
+
+    if NV′ == Nu′ # `V` has the global size of `u`: keep only the portion selected by `partition`
+        v = partition(V, u)
+    else # sizes differ: pass `V` through unchanged (presumably already rank-local data; TODO confirm)
+        v = V
     end
 
-    return u
+    return set_to_array!(u, v) # the actual copy is delegated to `set_to_array!`
 end
 
-# Automatically partition under the hood if sizes are compatible
-function set!(u::DistributedField, v::Union{Array, CuArray})
-    gsize = global_size(architecture(u), size(u))
-
-    if size(v) == gsize
-        f = partition_global_array(architecture(u), v, size(u))
-        u .= f
-        return u
+function set!(u::DistributedField, V::Field)
+    if size(V) == global_size(u) # `V` has the global size of `u`: partition it before setting
+        v = partition(V, u)
+        return set_to_array!(u, v)
     else
-        try
-            f = on_architecture(architecture(u), v)
-            u .= f
-            return u
-    
-        catch
-            throw(ArgumentError("ERROR: DimensionMismatch: array could not be set to match destination field"))
-        end
+        return set_to_field!(u, V) # any other size: delegate to `set_to_field!` without partitioning
     end
 end
 
+
 """
     synchronize_communication!(field)
 
@@ -69,12 +67,32 @@ function synchronize_communication!(field)
 
         # Reset MPI tag
         arch.mpi_tag[] -= arch.mpi_tag[]
-    
+
         # Reset MPI requests
         empty!(arch.mpi_requests)
     end
-    
+
     recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
-    
+
     return nothing
 end
+
+# Fallback: anything that is not a `DistributedField` is returned unchanged
+reconstruct_global_field(field) = field
+
+"""
+    reconstruct_global_field(field::DistributedField)
+
+Reconstruct a global field from a local field by combining the data from all processes.
+"""
+function reconstruct_global_field(field::DistributedField)
+    arch = architecture(field)
+    # Rebuild the global grid and combine the local interiors into one global array
+    full_grid = reconstruct_global_grid(field.grid)
+    full_data = construct_global_array(arch, interior(field), size(field))
+
+    result = Field(location(field), full_grid)
+    set!(result, full_data)
+
+    return result
+end
diff --git a/src/DistributedComputations/distributed_grids.jl b/src/DistributedComputations/distributed_grids.jl
index 90a3d4b80a..1b52477149 100644
--- a/src/DistributedComputations/distributed_grids.jl
+++ b/src/DistributedComputations/distributed_grids.jl
@@ -1,7 +1,7 @@
 using MPI
 using OffsetArrays
 using Oceananigans.Utils: getnamewrapper
-using Oceananigans.Grids: topology, size, halo_size, architecture, pop_flat_elements
+using Oceananigans.Grids: AbstractGrid, topology, size, halo_size, architecture, pop_flat_elements
 using Oceananigans.Grids: validate_rectilinear_grid_args, validate_lat_lon_grid_args, validate_size
 using Oceananigans.Grids: generate_coordinate, with_precomputed_metrics
 using Oceananigans.Grids: cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z
@@ -84,7 +84,6 @@ function RectilinearGrid(arch::Distributed,
     xl = Rx == 1 ? x : partition_coordinate(x, nx, arch, 1)
     yl = Ry == 1 ? y : partition_coordinate(y, ny, arch, 2)
     zl = Rz == 1 ? z : partition_coordinate(z, nz, arch, 3)
-
     Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, topology[1](), nx, Hx, xl, :x, child_architecture(arch))
     Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2](), ny, Hy, yl, :y, child_architecture(arch))
     Lz, zᵃᵃᶠ, zᵃᵃᶜ, Δzᵃᵃᶠ, Δzᵃᵃᶜ = generate_coordinate(FT, topology[3](), nz, Hz, zl, :z, child_architecture(arch))
@@ -160,6 +159,9 @@ function LatitudeLongitudeGrid(arch::Distributed,
     return !precompute_metrics ? preliminary_grid : with_precomputed_metrics(preliminary_grid)
 end
 
+# Fallback: a grid without a more specific (distributed) method is returned unchanged
+reconstruct_global_grid(grid::AbstractGrid) = grid
+
 """
     reconstruct_global_grid(grid::DistributedGrid)
 
@@ -174,7 +176,7 @@ function reconstruct_global_grid(grid::DistributedRectilinearGrid)
 
     nx, ny, nz = n = size(grid)
     Hx, Hy, Hz = H = halo_size(grid)
-    Nx, Ny, Nz = map(sum, concatenate_local_sizes(n, arch))
+    Nx, Ny, Nz = global_size(arch, n)
 
     TX, TY, TZ = topology(grid)
 
@@ -187,9 +189,9 @@ function reconstruct_global_grid(grid::DistributedRectilinearGrid)
     z = cpu_face_constructor_z(grid)
 
     ## This will not work with 3D parallelizations!!
-    xG = Rx == 1 ? x : assemble_coordinate(x, nx, Rx, ri, rj, rk, arch.communicator)
-    yG = Ry == 1 ? y : assemble_coordinate(y, ny, Ry, rj, ri, rk, arch.communicator)
-    zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator)
+    xG = Rx == 1 ? x : assemble_coordinate(x, nx, arch, 1)
+    yG = Ry == 1 ? y : assemble_coordinate(y, ny, arch, 2)
+    zG = Rz == 1 ? z : assemble_coordinate(z, nz, arch, 3)
 
     child_arch = child_architecture(arch)
 
@@ -217,7 +219,7 @@ function reconstruct_global_grid(grid::DistributedLatitudeLongitudeGrid)
 
     nλ, nφ, nz = n = size(grid)
     Hλ, Hφ, Hz = H = halo_size(grid)
-    Nλ, Nφ, Nz = map(sum, concatenate_local_sizes(n, arch))
+    Nλ, Nφ, Nz = global_size(arch, n)
 
     TX, TY, TZ = topology(grid)
 
@@ -230,9 +232,9 @@ function reconstruct_global_grid(grid::DistributedLatitudeLongitudeGrid)
     z = cpu_face_constructor_z(grid)
 
     ## This will not work with 3D parallelizations!!
-    λG = Rx == 1 ? λ : assemble_coordinate(λ, nλ, Rx, ri, rj, rk, arch.communicator)
-    φG = Ry == 1 ? φ : assemble_coordinate(φ, nφ, Ry, rj, ri, rk, arch.communicator)
-    zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator)
+    λG = Rx == 1 ? λ : assemble_coordinate(λ, nλ, arch, 1)
+    φG = Ry == 1 ? φ : assemble_coordinate(φ, nφ, arch, 2)
+    zG = Rz == 1 ? z : assemble_coordinate(z, nz, arch, 3)
 
     child_arch = child_architecture(arch)
 
@@ -264,14 +266,18 @@ end
 # take precedence on `DistributedGrid` 
 function with_halo(new_halo, grid::DistributedRectilinearGrid) 
     new_grid = with_halo(new_halo, reconstruct_global_grid(grid))    
-    return scatter_local_grids(architecture(grid), new_grid, size(grid))
+    return scatter_local_grids(new_grid, architecture(grid), size(grid))
 end
 
 function with_halo(new_halo, grid::DistributedLatitudeLongitudeGrid) 
     new_grid = with_halo(new_halo, reconstruct_global_grid(grid))    
-    return scatter_local_grids(architecture(grid), new_grid, size(grid))
+    return scatter_local_grids(new_grid, architecture(grid), size(grid))
 end
 
+# `child_architecture` for grids: a generic grid returns its architecture, a distributed grid the child of its architecture
+child_architecture(grid::AbstractGrid) = architecture(grid)
+child_architecture(grid::DistributedGrid) = child_architecture(architecture(grid))
+
 """ 
     scatter_grid_properties(global_grid)
 
@@ -289,13 +295,13 @@ function scatter_grid_properties(global_grid)
     return x, y, z, topo, halo
 end
 
-function scatter_local_grids(arch::Distributed, global_grid::RectilinearGrid, local_size)
+function scatter_local_grids(global_grid::RectilinearGrid, arch::Distributed, local_size)
     x, y, z, topo, halo = scatter_grid_properties(global_grid)
     global_sz = global_size(arch, local_size)
     return RectilinearGrid(arch, eltype(global_grid); size=global_sz, x=x, y=y, z=z, halo=halo, topology=topo)
 end
 
-function scatter_local_grids(arch::Distributed, global_grid::LatitudeLongitudeGrid, local_size)
+function scatter_local_grids(global_grid::LatitudeLongitudeGrid, arch::Distributed, local_size)
     x, y, z, topo, halo = scatter_grid_properties(global_grid)
     global_sz = global_size(arch, local_size)
     return LatitudeLongitudeGrid(arch, eltype(global_grid); size=global_sz, longitude=x, 
diff --git a/src/DistributedComputations/distributed_transpose.jl b/src/DistributedComputations/distributed_transpose.jl
new file mode 100644
index 0000000000..06e765ce1d
--- /dev/null
+++ b/src/DistributedComputations/distributed_transpose.jl
@@ -0,0 +1,193 @@
+using Oceananigans.Grids: architecture
+using Oceananigans.Architectures: on_architecture
+using KernelAbstractions: @index, @kernel
+using MPI: VBuffer, Alltoallv!
+
+# Transpose directions are assumed to work only in the following configuration
+# z -> y -> x -> y -> z
+# where z stands for z-local data, y for y-local data, and x for x-local data
+# The initial field is always assumed to be in the z-complete configuration
+
+# Fallbacks for slab decompositions
+transpose_z_to_y!(::SlabYFields) = nothing
+transpose_y_to_z!(::SlabYFields) = nothing
+transpose_x_to_y!(::SlabXFields) = nothing
+transpose_y_to_x!(::SlabXFields) = nothing
+
+# Since z -> y -> x -> y -> z we only need to define the `pack` and `unpack` kernels
+# for the x and z configurations once; y requires two definitions depending on which
+# configuration it is interacting with. Therefore, `_pack_buffer_x!` and `_pack_buffer_z!`
+# are packing the buffer from an x-local configuration and from a z-local configuration.
+# There is no ambiguity here because the x- and z-configurations communicate only with the y-configuration.
+# On the other hand, for the y-configuration there are two ways to pack a buffer and two ways to unpack it,
+# depending on whether the y-configuration is going to (or coming from) an x- or a z-configuration.
+
+@kernel function _pack_buffer_z_to_y!(yzbuff, zfield, N)
+    i, j, k = @index(Global, NTuple)
+    idx = j + N[2] * ((i - 1) + N[1] * (k - 1)) # linear send index: j runs fastest, then i, then k
+    @inbounds yzbuff.send[idx] = zfield[i, j, k]
+end
+
+@kernel function _pack_buffer_x_to_y!(xybuff, xfield, N)
+    i, j, k = @index(Global, NTuple)
+    idx = j + N[2] * ((k - 1) + N[3] * (i - 1)) # linear send index: j runs fastest, then k, then i
+    @inbounds xybuff.send[idx] = xfield[i, j, k]
+end
+
+# Pack the y-local field into the xy send buffer (y -> x communication)
+@kernel function _pack_buffer_y_to_x!(xybuff, yfield, N) 
+    i, j, k = @index(Global, NTuple)
+    idx = i + N[1] * ((k - 1) + N[3] * (j - 1)) # linear send index: i runs fastest, then k, then j
+    @inbounds xybuff.send[idx] = yfield[i, j, k]
+end
+
+# Pack the y-local field into the yz send buffer (y -> z communication); linear index: k fastest, then i, then j
+@kernel function _pack_buffer_y_to_z!(yzbuff, yfield, N) 
+    i, j, k = @index(Global, NTuple)
+    Nx, _, Nz = N
+    @inbounds yzbuff.send[k + Nz * (i-1 + Nx * (j-1))] = yfield[i, j, k]
+end
+
+@kernel function _unpack_buffer_x_from_y!(xybuff, xfield, N, n)
+    i, j, k = @index(Global, NTuple)
+    nx, Ny, Nz = n[1], N[2], N[3] # extents of one received chunk
+    # `chunk` is the zero-based chunk holding index i, `offset` the zero-based x-position inside it
+    chunk, offset = divrem(i - 1, nx)
+    @inbounds begin
+        idx = (offset + 1) + nx * ((k - 1) + Nz * (j - 1)) + chunk * (nx * Ny * Nz)
+        xfield[i, j, k] = xybuff.recv[idx]
+    end
+end
+
+@kernel function _unpack_buffer_z_from_y!(yzbuff, zfield, N, n)
+    i, j, k = @index(Global, NTuple)
+    Nx, Ny, nz = N[1], N[2], n[3] # extents of one received chunk
+    # `chunk` is the zero-based chunk holding index k, `offset` the zero-based z-position inside it
+    chunk, offset = divrem(k - 1, nz)
+    @inbounds begin
+        idx = (offset + 1) + nz * ((i - 1) + Nx * (j - 1)) + chunk * (Nx * Ny * nz)
+        zfield[i, j, k] = yzbuff.recv[idx]
+    end
+end
+
+# Unpack the yz receive buffer into the y-local field after a z -> y communication
+@kernel function _unpack_buffer_y_from_z!(yzbuff, yfield, N, n) 
+    i, j, k = @index(Global, NTuple)
+    Nx, ny, Nz = N[1], n[2], N[3] # extents of one received chunk
+    # `chunk` is the zero-based chunk holding index j, `offset` the zero-based y-position inside it
+    chunk, offset = divrem(j - 1, ny)
+    @inbounds begin
+        idx = (offset + 1) + ny * ((i - 1) + Nx * (k - 1)) + chunk * (Nx * ny * Nz)
+        yfield[i, j, k] = yzbuff.recv[idx]
+    end
+end
+
+# Unpack the xy receive buffer into the y-local field after an x -> y communication
+@kernel function _unpack_buffer_y_from_x!(xybuff, yfield, N, n) 
+    i, j, k = @index(Global, NTuple)
+    chunk_size = N[1], n[2], N[3] # extents of one received chunk (named so as not to shadow `size`)
+    @inbounds begin
+        j′  = mod(j - 1, chunk_size[2]) + 1 # y-position inside the chunk
+        m   = (j - 1) ÷ chunk_size[2] # zero-based index of the chunk holding index j
+        idx = j′ + chunk_size[2] * (k-1 + chunk_size[3] * (i-1)) + m * prod(chunk_size)
+        yfield[i, j, k] = xybuff.recv[idx]
+    end
+end
+
+pack_buffer_x_to_y!(buff, f) = launch!(architecture(f), f.grid, :xyz, _pack_buffer_x_to_y!, buff, f, size(f))
+pack_buffer_z_to_y!(buff, f) = launch!(architecture(f), f.grid, :xyz, _pack_buffer_z_to_y!, buff, f, size(f))
+pack_buffer_y_to_x!(buff, f) = launch!(architecture(f), f.grid, :xyz, _pack_buffer_y_to_x!, buff, f, size(f))
+pack_buffer_y_to_z!(buff, f) = launch!(architecture(f), f.grid, :xyz, _pack_buffer_y_to_z!, buff, f, size(f))
+# In the unpack wrappers `f` is the field being filled and `fo` the field the data came from; both sizes go to the kernel
+unpack_buffer_x_from_y!(f, fo, buff) = launch!(architecture(f), f.grid, :xyz, _unpack_buffer_x_from_y!, buff, f, size(f), size(fo))
+unpack_buffer_z_from_y!(f, fo, buff) = launch!(architecture(f), f.grid, :xyz, _unpack_buffer_z_from_y!, buff, f, size(f), size(fo))
+unpack_buffer_y_from_x!(f, fo, buff) = launch!(architecture(f), f.grid, :xyz, _unpack_buffer_y_from_x!, buff, f, size(f), size(fo))
+unpack_buffer_y_from_z!(f, fo, buff) = launch!(architecture(f), f.grid, :xyz, _unpack_buffer_y_from_z!, buff, f, size(f), size(fo))
+
+for (from, to, buff) in zip([:y, :z, :y, :x], [:z, :y, :x, :y], [:yz, :yz, :xy, :xy])
+    transpose!      = Symbol(:transpose_, from, :_to_, to, :(!)) # e.g. :transpose_y_to_z!
+    pack_buffer!    = Symbol(:pack_buffer_, from, :_to_, to, :(!)) # e.g. :pack_buffer_y_to_z!
+    unpack_buffer!  = Symbol(:unpack_buffer_, to, :_from_, from, :(!)) # e.g. :unpack_buffer_z_from_y!
+    
+    buffer = Symbol(buff, :buff) # :yzbuff or :xybuff
+    fromfield = Symbol(from, :field)
+    tofield = Symbol(to, :field)
+
+    transpose_name = string(transpose!)
+    to_name = string(to)
+    from_name = string(from)
+
+    pack_buffer_name = string(pack_buffer!)
+    unpack_buffer_name = string(unpack_buffer!)
+
+    @eval begin
+        """
+            $($transpose_name)(pf::TransposableField)
+
+        Transpose the fields in `TransposableField` from a $($from_name)-local configuration
+        (located in `pf.$($from_name)field`) to a $($to_name)-local configuration located
+        in `pf.$($to_name)field`.
+
+        Transpose Algorithm:
+        ====================
+
+        The transpose algorithm works in the following manner
+
+        1. We `pack` the three-dimensional data into a one-dimensional buffer to be sent to the other cores.
+           We need to synchronize the GPU afterwards before any communication can take place. The packing is
+           done in the `$($pack_buffer_name)` function.
+
+        2. The one-dimensional buffer is communicated to all the cores using an in-place `Alltoallv!` MPI
+           routine. From the [MPI.jl documentation](https://juliaparallel.org/MPI.jl/stable/reference/collective/):
+
+           Every process divides the Buffer into `Comm_size(comm)` chunks of equal size,
+           sending the j-th chunk to the process of rank j-1. Every process stores the data received from rank j-1 process
+           in the j-th chunk of the buffer.
+
+           ```
+           rank    send buf                             recv buf
+           ----    --------                             --------
+           0      a, b, c, d, e, f       Alltoall      a, b, A, B, α, β
+           1      A, B, C, D, E, F  ---------------->  c, d, C, D, γ, ψ
+           2      α, β, γ, ψ, η, ν                     e, f, E, F, η, ν
+           ```
+
+           The `Alltoallv` function allows chunks of different sizes to be sent to different cores by passing a `count`.
+           For the moment, chunks of the same size are passed, requiring that the ranks divide the number of grid
+           cells evenly.
+
+        3. Once the chunks have been communicated, we `unpack` the received one-dimensional buffer into the three-dimensional
+           field making sure the configuration of the data fits the reshaping. The unpacking is
+           done via the `$($unpack_buffer_name)` function.
+
+        Limitations:
+        ============
+
+        - The transpose is configured to work only in the following four directions:
+         
+          1. z-local to y-local
+          2. y-local to x-local
+          3. x-local to y-local
+          4. y-local to z-local
+
+          i.e., there is no direct transpose connecting an x-local to a z-local configuration.
+
+        - Since (at the moment) the `Alltoallv` allows only chunks of the same size to be communicated, and
+          x-local and z-local only communicate through the y-local configuration, the limitations are that:
+
+          * The number of ranks that divide the x-direction should divide evenly the y-direction
+          * The number of ranks that divide the y-direction should divide evenly the x-direction
+
+          which implies that
+
+          * For 2D fields in XY (flat z-direction) we can transpose only if the partitioning is in X
+        """
+        function $transpose!(pf::TransposableField)
+            $pack_buffer!(pf.$buffer, pf.$fromfield) # pack the one-dimensional buffer for Alltoallv! call
+            sync_device!(architecture(pf.$fromfield)) # Device needs to be synched with host before MPI call
+            Alltoallv!(VBuffer(pf.$buffer.send, pf.counts.$buff), VBuffer(pf.$buffer.recv, pf.counts.$buff), pf.comms.$buff) # Actually transpose!
+            $unpack_buffer!(pf.$tofield, pf.$fromfield, pf.$buffer) # unpack the one-dimensional buffer into the 3D field
+            return nothing
+        end
+    end
+end
diff --git a/src/DistributedComputations/halo_communication.jl b/src/DistributedComputations/halo_communication.jl
index 06264dd39c..872ffa6331 100644
--- a/src/DistributedComputations/halo_communication.jl
+++ b/src/DistributedComputations/halo_communication.jl
@@ -12,6 +12,7 @@ using Oceananigans.BoundaryConditions:
     fill_halo_size,
     fill_halo_offset,
     permute_boundary_conditions,
+    fill_open_boundary_regions!,
     PBCT, DCBCT, DCBC
 
 import Oceananigans.BoundaryConditions:
@@ -97,7 +98,11 @@ function fill_halo_regions!(field::DistributedField, args...; kwargs...)
                               kwargs...)
 end
 
-function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::DistributedGrid, buffers, args...; kwargs...)
+function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::DistributedGrid, buffers, args...; fill_boundary_normal_velocities = true, kwargs...)
+    if fill_boundary_normal_velocities
+        fill_open_boundary_regions!(c, bcs, indices, loc, grid, args...; kwargs...)
+    end
+    
     arch             = architecture(grid)
     fill_halos!, bcs = permute_boundary_conditions(bcs) 
 
diff --git a/src/DistributedComputations/halo_communication_bcs.jl b/src/DistributedComputations/halo_communication_bcs.jl
index c722205558..353110b8a3 100644
--- a/src/DistributedComputations/halo_communication_bcs.jl
+++ b/src/DistributedComputations/halo_communication_bcs.jl
@@ -17,15 +17,15 @@ function inject_halo_communication_boundary_conditions(field_bcs, local_rank, co
     rank_north  = connectivity.north
     rank_south  = connectivity.south
 
-    east_comm_ranks   = HaloCommunicationRanks(from=local_rank, to=rank_east)
-    west_comm_ranks   = HaloCommunicationRanks(from=local_rank, to=rank_west)
-    north_comm_ranks  = HaloCommunicationRanks(from=local_rank, to=rank_north)
-    south_comm_ranks  = HaloCommunicationRanks(from=local_rank, to=rank_south)
-
-    east_comm_bc   = DistributedCommunicationBoundaryCondition(east_comm_ranks)
-    west_comm_bc   = DistributedCommunicationBoundaryCondition(west_comm_ranks)
-    north_comm_bc  = DistributedCommunicationBoundaryCondition(north_comm_ranks)
-    south_comm_bc  = DistributedCommunicationBoundaryCondition(south_comm_ranks)
+    east_comm_ranks  = HaloCommunicationRanks(from=local_rank, to=rank_east)
+    west_comm_ranks  = HaloCommunicationRanks(from=local_rank, to=rank_west)
+    north_comm_ranks = HaloCommunicationRanks(from=local_rank, to=rank_north)
+    south_comm_ranks = HaloCommunicationRanks(from=local_rank, to=rank_south)
+
+    east_comm_bc  = DistributedCommunicationBoundaryCondition(east_comm_ranks)
+    west_comm_bc  = DistributedCommunicationBoundaryCondition(west_comm_ranks)
+    north_comm_bc = DistributedCommunicationBoundaryCondition(north_comm_ranks)
+    south_comm_bc = DistributedCommunicationBoundaryCondition(south_comm_ranks)
 
     TX, TY, _ = topology
 
diff --git a/src/DistributedComputations/partition_assemble.jl b/src/DistributedComputations/partition_assemble.jl
index a6e730cf47..c9bbe6b8d3 100644
--- a/src/DistributedComputations/partition_assemble.jl
+++ b/src/DistributedComputations/partition_assemble.jl
@@ -1,8 +1,8 @@
-import Oceananigans.Architectures: on_architecture
+using Oceananigans.Fields: Field
 
-all_reduce(op, val, arch::Distributed) = 
-    MPI.Allreduce(val, op, arch.communicator)
+import Oceananigans.Architectures: on_architecture
 
+all_reduce(op, val, arch::Distributed) = MPI.Allreduce(val, op, arch.communicator)
 all_reduce(op, val, arch) = val
 
 # MPI Barrier
@@ -10,49 +10,57 @@ barrier!(arch) = nothing
 barrier!(arch::Distributed) = MPI.Barrier(arch.communicator)
 
 """
-    concatenate_local_sizes(n, arch::Distributed) 
+    concatenate_local_sizes(local_size, arch::Distributed) 
 
-Return a 3-Tuple containing a vector of `size(grid, idx)` for each rank in 
+Return a 3-Tuple containing a vector of `size(grid, dim)` for each rank in 
 all 3 directions.
 """
-concatenate_local_sizes(n, arch::Distributed) = 
-    Tuple(concatenate_local_sizes(n, arch, i) for i in 1:length(n))
+concatenate_local_sizes(local_size, arch::Distributed) = 
+    Tuple(concatenate_local_sizes(local_size, arch, d) for d in 1:length(local_size))
+
+concatenate_local_sizes(sz, arch, dim) = concatenate_local_sizes(sz[dim], arch, dim)
 
-function concatenate_local_sizes(n, arch::Distributed, idx)
-    R = arch.ranks[idx]
-    r = arch.local_index[idx]
-    n = n isa Number ? n : n[idx]
-    l = zeros(Int, R)
+function concatenate_local_sizes(n::Number, arch::Distributed, dim)
+    R = arch.ranks[dim] # number of ranks along `dim`
+    r = arch.local_index[dim] # index of this rank along `dim`
+    N = zeros(Int, R) # one slot per rank along `dim`, filled by the reduction below
 
-    r1, r2 = arch.local_index[[1, 2, 3] .!= idx]
+    r1, r2 = arch.local_index[[1, 2, 3] .!= dim] # indices of this rank along the two other directions
     
     if r1 == 1 && r2 == 1
-        l[r] = n
+        N[r] = n # only ranks at position 1 in both other directions contribute, so the sum counts each size once
     end
 
-    MPI.Allreduce!(l, +, arch.communicator)
+    MPI.Allreduce!(N, +, arch.communicator) # in-place `+` reduction over `arch.communicator`
     
-    return l
+    return N
 end
 
-# Partitioning (localization of global objects) and assembly (global assembly of local objects)
-# Used for grid constructors (cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z)
-# We need to repeat the value at the right boundary
-function partition_coordinate(c::AbstractVector, n, arch, idx)
-    nl = concatenate_local_sizes(n, arch, idx)
-    r  = arch.local_index[idx]
-    # Allow for Face values
-    if r == arch.ranks[idx]
-        return c[1 + sum(nl[1:r-1]) : end]
+"""
+    partition_coordinate(coordinate, n, arch, dim)
+
+Return the local component of the global `coordinate`, which has
+local length `n` and is distributed on `arch`itecture
+in the x-, y-, or z- `dim`ension.
+"""
+function partition_coordinate(c::AbstractVector, n, arch, dim)
+    nl = concatenate_local_sizes(n, arch, dim) # local sizes of every rank along `dim`
+    r  = arch.local_index[dim]
+
+    start_idx = sum(nl[1:r-1]) + 1 # sum of all previous ranks' sizes + 1
+    end_idx   = if r == arch.ranks[dim] # last rank along `dim`: take everything up to the end of `c`
+        length(c)
     else
-        return c[1 + sum(nl[1:r-1]) : sum(nl[1:r])]
+        sum(nl[1:r]) + 1 # one past this rank's own entries, so the right boundary value is repeated
     end
+
+    return c[start_idx:end_idx]
 end
 
-function partition_coordinate(c::Tuple, n, arch, idx)
-    nl = concatenate_local_sizes(n, arch, idx)
+function partition_coordinate(c::Tuple, n, arch, dim)
+    nl = concatenate_local_sizes(n, arch, dim)
     N  = sum(nl)
-    R  = arch.ranks[idx]
+    R  = arch.ranks[dim]
     Δl = (c[2] - c[1]) / N  
 
     l = Tuple{Float64, Float64}[(c[1], c[1] + Δl * nl[1])]
@@ -61,7 +69,7 @@ function partition_coordinate(c::Tuple, n, arch, idx)
         push!(l, (lp, lp + Δl * nl[i]))
     end
 
-    return l[arch.local_index[idx]]
+    return l[arch.local_index[dim]]
 end
 
 """
@@ -72,72 +80,100 @@ a local number of elements `Nc`, number of ranks `Nr`, rank `r`,
 and `arch`itecture. Since we use a global reduction, only ranks at positions
 1 in the other two directions `r1 == 1` and `r2 == 1` fill the 1D array.
 """
-function assemble_coordinate(c_local::AbstractVector, n, R, r, r1, r2, comm) 
-    nl = concatenate_local_sizes(n, R, r)
+function assemble_coordinate(c_local::AbstractVector, n, arch, dim) 
+    nl = concatenate_local_sizes(n, arch, dim)
+    R  = arch.ranks[dim]
+    r  = arch.local_index[dim]
+    r2 = [arch.local_index[i] for i in filter(x -> x != dim, (1, 2, 3))]
 
     c_global = zeros(eltype(c_local), sum(nl)+1)
 
-    if r1 == 1 && r2 == 1
+    if r2[1] == 1 && r2[2] == 1
         c_global[1 + sum(nl[1:r-1]) : sum(nl[1:r])] .= c_local[1:end-1]
-        r == Nr && (c_global[end] = c_local[end])
+        r == R && (c_global[end] = c_local[end])
     end
 
-    MPI.Allreduce!(c_global, +, comm)
+    MPI.Allreduce!(c_global, +, arch.communicator)
 
     return c_global
 end
 
 # Simple case, just take the first and the last core
-function assemble_coordinate(c::Tuple, n, R, r, r1, r2, comm) 
+function assemble_coordinate(c_local::Tuple, n, arch, dim) 
     c_global = zeros(Float64, 2)
-
-    if r == 1 && r1 == 1 && r2 == 1
-        c_global[1] = c[1]
-    elseif r == R && r1 == 1 && r2 == 1
-        c_global[2] = c[2]
+    
+    rank = arch.local_index
+    R    = arch.ranks[dim]
+    r    = rank[dim]
+    r2   = [rank[i] for i in filter(x -> x != dim, (1, 2, 3))] # ranks in the two directions other than `dim`
+
+    if rank[1] == 1 && rank[2] == 1 && rank[3] == 1 # first rank in every direction holds the lower bound
+        c_global[1] = c_local[1]
+    elseif r == R && r2[1] == 1 && r2[2] == 1 # last rank along `dim`, first in BOTH other directions
+        c_global[2] = c_local[2]
     end
 
-    MPI.Allreduce!(c_global, +, comm)
+    MPI.Allreduce!(c_global, +, arch.communicator)
 
     return tuple(c_global...)
 end 
 
-# TODO: partition_global_array and construct_global_array
-# do not currently work for 3D parallelizations
-# (They are not used anywhere in the code at the moment exept for immersed boundaries)
+# TODO: make partition and construct_global_array work for 3D distribution.
+
 """
-    partition_global_array(arch, c_global, (nx, ny, nz))
+    partition(A, b)
+
+Partition the globally-sized `A` into local arrays with the same size as `b`.
+"""
+partition(A, b::Field) = partition(A, architecture(b), size(b))
+partition(F::Field, b::Field) = partition(interior(F), b)
+partition(f::Function, arch, n) = f
+partition(A::AbstractArray, arch::AbstractSerialArchitecture, local_size) = A
 
-Partition a global array in local arrays of size `(nx, ny)` if 2D or `(nx, ny, nz)` is 3D.
-Usefull for boundary arrays, forcings and initial conditions.
 """
-partition_global_array(arch, c_global::AbstractArray, n) = c_global
-partition_global_array(arch, c_global::Function, n)      = c_global 
+    partition(A, arch, local_size)
 
-# Here we assume that we cannot partition in z (we should remove support for that)
-function partition_global_array(arch::Distributed, c_global::AbstractArray, n) 
-    c_global = on_architecture(CPU(), c_global)
+Partition the globally-sized `A` into local arrays with `local_size` on `arch`itecture.
+"""
+function partition(A::AbstractArray, arch::Distributed, local_size) 
+    A = on_architecture(CPU(), A)
 
     ri, rj, rk = arch.local_index
+    dims = length(size(A))
 
-    dims = length(size(c_global))
-    nx, ny, nz = concatenate_local_sizes(n, arch)
+    # Vectors with the local size for every rank
+    nxs, nys, nzs = concatenate_local_sizes(local_size, arch)
 
-    nz = nz[1]
+    # The local size
+    nx = nxs[ri]
+    ny = nys[rj]
+    nz = nzs[1]
+    # @assert (nx, ny, nz) == local_size
 
-    if dims == 2 
-        c_local = zeros(eltype(c_global), nx[ri], ny[rj])
+    up_to = nxs[1:ri-1]
+    including = nxs[1:ri]
+    i₁ = sum(up_to) + 1
+    i₂ = sum(including)
 
-        c_local .= c_global[1 + sum(nx[1:ri-1]) : sum(nx[1:ri]), 
-                            1 + sum(ny[1:rj-1]) : sum(ny[1:rj])]
-    else
-        c_local = zeros(eltype(c_global), nx[ri], ny[rj], nz)
+    up_to = nys[1:rj-1]
+    including = nys[1:rj]
+    j₁ = sum(up_to) + 1
+    j₂ = sum(including)
+
+    ii = UnitRange(i₁, i₂)
+    jj = UnitRange(j₁, j₂)
+    kk = 1:nz # no partitioning in z
 
-        c_local .= c_global[1 + sum(nx[1:ri-1]) : sum(nx[1:ri]), 
-                            1 + sum(ny[1:rj-1]) : sum(ny[1:rj]), 
-                            1:nz]
+    # TODO: undo this toxic assumption that all 2D arrays span x, y.
+    if dims == 2 
+        a = zeros(eltype(A), nx, ny)
+        a .= A[ii, jj]
+    else
+        a = zeros(eltype(A), nx, ny, nz)
+        a .= A[ii, jj, 1:nz]
     end
-    return on_architecture(child_architecture(arch), c_local)
+
+    return on_architecture(child_architecture(arch), a)
 end
 
 """
@@ -149,7 +185,7 @@ Usefull for boundary arrays, forcings and initial conditions.
 construct_global_array(arch, c_local::AbstractArray, n) = c_local
 construct_global_array(arch, c_local::Function, N)      = c_local
 
-# TODO: This does not work for 3D parallelizations!!!
+# TODO: This does not work for 3D parallelizations
 function construct_global_array(arch::Distributed, c_local::AbstractArray, n) 
     c_local = on_architecture(CPU(), c_local)
 
diff --git a/src/DistributedComputations/plan_distributed_transforms.jl b/src/DistributedComputations/plan_distributed_transforms.jl
new file mode 100644
index 0000000000..e93a5f589a
--- /dev/null
+++ b/src/DistributedComputations/plan_distributed_transforms.jl
@@ -0,0 +1,42 @@
+using Oceananigans.Architectures: array_type
+using Oceananigans.Solvers: plan_forward_transform, plan_backward_transform, DiscreteTransform
+using Oceananigans.Solvers: Forward, Backward
+
+@inline reshaped_size(grid) = size(grid, 2), size(grid, 1), size(grid, 3)
+
+function plan_distributed_transforms(global_grid, storage::TransposableField, planner_flag)
+    topo = topology(global_grid)
+    arch = architecture(global_grid)
+
+    grids = (storage.zfield.grid, storage.yfield.grid, storage.xfield.grid)
+
+    rs_size    = reshaped_size(grids[2])
+    rs_storage = reshape(parent(storage.yfield), rs_size)
+
+    forward_plan_x  =  plan_forward_transform(parent(storage.xfield), topo[1](), [1], planner_flag)
+    forward_plan_z  =  plan_forward_transform(parent(storage.zfield), topo[3](), [3], planner_flag)
+    backward_plan_x = plan_backward_transform(parent(storage.xfield), topo[1](), [1], planner_flag)
+    backward_plan_z = plan_backward_transform(parent(storage.zfield), topo[3](), [3], planner_flag)
+
+    if arch isa GPU
+        forward_plan_y  =  plan_forward_transform(rs_storage, topo[2](), [1], planner_flag) 
+        backward_plan_y = plan_backward_transform(rs_storage, topo[2](), [1], planner_flag) 
+    else
+        forward_plan_y  =  plan_forward_transform(parent(storage.yfield), topo[2](), [2], planner_flag) 
+        backward_plan_y = plan_backward_transform(parent(storage.yfield), topo[2](), [2], planner_flag) 
+    end
+
+    forward_operations = (
+        z! = DiscreteTransform(forward_plan_z, Forward(), grids[1], [3]),
+        y! = DiscreteTransform(forward_plan_y, Forward(), grids[2], [2]),
+        x! = DiscreteTransform(forward_plan_x, Forward(), grids[3], [1]),
+    )
+
+    backward_operations = (
+        x! = DiscreteTransform(backward_plan_x, Backward(), grids[3], [1]),
+        y! = DiscreteTransform(backward_plan_y, Backward(), grids[2], [2]),
+        z! = DiscreteTransform(backward_plan_z, Backward(), grids[1], [3]),
+    )
+
+    return (; forward = forward_operations, backward = backward_operations)
+end
\ No newline at end of file
diff --git a/src/DistributedComputations/transposable_field.jl b/src/DistributedComputations/transposable_field.jl
new file mode 100644
index 0000000000..be630cc46a
--- /dev/null
+++ b/src/DistributedComputations/transposable_field.jl
@@ -0,0 +1,200 @@
+using Oceananigans.Grids: architecture, deflate_tuple
+using Oceananigans.Architectures: on_architecture
+
+struct TransposableField{FX, FY, FZ, YZ, XY, C, Comms}
+    xfield :: FX # X-direction is free (x-local)
+    yfield :: FY # Y-direction is free (y-local)
+    zfield :: FZ # Z-direction is free (original field, z-local)
+    yzbuff :: YZ # if `nothing` slab decomposition with `Ry == 1`
+    xybuff :: XY # if `nothing` slab decomposition with `Rx == 1`
+    counts :: C
+    comms  :: Comms
+end
+
+const SlabYFields = TransposableField{<:Any, <:Any, <:Any, <:Nothing} # Y-direction is free
+const SlabXFields = TransposableField{<:Any, <:Any, <:Any, <:Any, <:Nothing} # X-direction is free
+
+"""
+    TransposableField(field_in, FT = eltype(field_in); with_halos = false)
+
+Construct a TransposableField object that contains the allocated memory and the ruleset required
+for distributed transpositions. This includes:
+- `xfield`: A field with an unpartitioned x-direction (x-local)
+- `yfield`: A field with an unpartitioned y-direction (y-local)
+- `zfield`: A field with an unpartitioned z-direction (z-local)
+- one-dimensional buffers for performing communication between the different configurations, in particular:
+    - `yzbuff`: A buffer for communication between the z- and y-configurations
+    - `xybuff`: A buffer for communication between the y- and x-configurations
+  These buffers are "packed" with the three dimensional data and then "unpacked" in the target configuration once
+  received by the target rank.
+- `counts`: The size of the chunks in the buffers to be sent and received
+- `comms`: The MPI communicators for the yz and xy directions (different from MPI.COMM_WORLD!!!)
+
+A `TransposableField` object is used to perform distributed transpositions between different configurations with the 
+`transpose_z_to_y!`, `transpose_y_to_x!`, `transpose_x_to_y!`, and `transpose_y_to_z!` functions. 
+In particular:
+- `transpose_z_to_y!` copies data from the z-configuration (`zfield`) to the y-configuration (`yfield`)
+- `transpose_y_to_x!` copies data from the y-configuration (`yfield`) to the x-configuration (`xfield`)
+- `transpose_x_to_y!` copies data from the x-configuration (`xfield`) to the y-configuration (`yfield`)
+- `transpose_y_to_z!` copies data from the y-configuration (`yfield`) to the z-configuration (`zfield`)
+
+For more information on the transposition algorithm, see the docstring for the `transpose` functions.
+
+# Arguments
+- `field_in`: The input field. It needs to be in a _z-free_ configuration (i.e. ranks[3] == 1).
+- `FT`: The element type of the field. Defaults to the element type of `field_in`.
+- `with_halos`: A boolean indicating whether to include halos in the field. Defaults to `false`.
+"""
+function TransposableField(field_in, FT = eltype(field_in); with_halos = false)
+
+    zgrid = field_in.grid # We support only a 2D partition in X and Y
+    ygrid = twin_grid(zgrid; local_direction = :y)
+    xgrid = twin_grid(zgrid; local_direction = :x)
+
+    xN = size(xgrid)
+    yN = size(ygrid)
+    zN = size(zgrid)
+
+    zarch = architecture(zgrid)
+    yarch = architecture(ygrid)
+
+    loc = location(field_in)
+
+    Rx, Ry, _ = zarch.ranks
+    if with_halos 
+        zfield = Field(loc, zgrid, FT)
+        yfield = Ry == 1 ? zfield : Field(loc, ygrid, FT)
+        xfield = Rx == 1 ? yfield : Field(loc, xgrid, FT)
+    else
+        zfield = Field(loc, zgrid, FT; indices = (1:zN[1], 1:zN[2], 1:zN[3]))
+        yfield = Ry == 1 ? zfield : Field(loc, ygrid, FT; indices = (1:yN[1], 1:yN[2], 1:yN[3]))
+        xfield = Rx == 1 ? yfield : Field(loc, xgrid, FT; indices = (1:xN[1], 1:xN[2], 1:xN[3]))
+    end
+
+    # One dimensional buffers to "pack" three-dimensional data in for communication 
+    yzbuffer = Ry == 1 ? nothing : (send = on_architecture(zarch, zeros(FT, prod(yN))), 
+                                    recv = on_architecture(zarch, zeros(FT, prod(zN))))
+    xybuffer = Rx == 1 ? nothing : (send = on_architecture(zarch, zeros(FT, prod(xN))), 
+                                    recv = on_architecture(zarch, zeros(FT, prod(yN))))
+
+    yzcomm = MPI.Comm_split(MPI.COMM_WORLD, zarch.local_index[1], zarch.local_index[1])
+    xycomm = MPI.Comm_split(MPI.COMM_WORLD, yarch.local_index[3], yarch.local_index[3])
+
+    zRx, zRy, zRz = ranks(zarch) 
+    yRx, yRy, yRz = ranks(yarch) 
+
+    # size of the chunks in the buffers to be sent and received
+    # (see the docstring for the `transpose` algorithms)    
+    yzcounts = zeros(Int, zRy * zRz)
+    xycounts = zeros(Int, yRx * yRy)
+
+    yzrank = MPI.Comm_rank(yzcomm)
+    xyrank = MPI.Comm_rank(xycomm)
+
+    yzcounts[yzrank + 1] = yN[1] * zN[2] * yN[3]
+    xycounts[xyrank + 1] = yN[1] * xN[2] * xN[3]
+
+    MPI.Allreduce!(yzcounts, +, yzcomm)
+    MPI.Allreduce!(xycounts, +, xycomm)
+
+    return TransposableField(xfield, yfield, zfield, 
+                             yzbuffer, xybuffer,
+                             (; yz = yzcounts, xy = xycounts),
+                             (; yz = yzcomm,   xy = xycomm))
+end
+
+#####
+##### Twin transposed grid
+#####
+
+"""
+    twin_grid(grid::DistributedGrid; local_direction = :y)
+
+Construct a "twin" grid based on the provided distributed `grid` object.
+The twin grid is a grid that discretizes the same domain of the original grid, just with a
+different partitioning strategy whereas the "local dimension" (i.e. the non-partitioned dimension)
+is specified by the keyword argument `local_direction`. This could be either `:x` or `:y`.
+
+Note that `local_direction = :z` will return the original grid as we do not allow partitioning in
+the `z` direction.
+"""
+function twin_grid(grid::DistributedGrid; local_direction = :y)
+
+    arch = grid.architecture
+    ri, rj, rk = arch.local_index
+
+    R = arch.ranks
+
+    nx, ny, nz = n = size(grid)
+    Nx, Ny, Nz = global_size(arch, n)
+
+    TX, TY, TZ = topology(grid)
+
+    TX = reconstruct_global_topology(TX, R[1], ri, rj, rk, arch.communicator)
+    TY = reconstruct_global_topology(TY, R[2], rj, ri, rk, arch.communicator)
+    TZ = reconstruct_global_topology(TZ, R[3], rk, ri, rj, arch.communicator)
+
+    x = cpu_face_constructor_x(grid)
+    y = cpu_face_constructor_y(grid)
+    z = cpu_face_constructor_z(grid)
+
+    xG = R[1] == 1 ? x : assemble_coordinate(x, nx, arch, 1)
+    yG = R[2] == 1 ? y : assemble_coordinate(y, ny, arch, 2)
+    zG = R[3] == 1 ? z : assemble_coordinate(z, nz, arch, 3)
+
+    child_arch = child_architecture(arch)
+
+    FT = eltype(grid)
+
+    if local_direction == :y
+        ranks = R[1], 1, R[2]
+
+        nnx, nny, nnz = nx, Ny, nz ÷ ranks[3]
+
+        if (nnz * ranks[3] < Nz) && (rj == ranks[3])
+            nnz = Nz - nnz * (ranks[3] - 1)
+        end
+    elseif local_direction == :x
+        ranks = 1, R[1], R[2]
+
+        nnx, nny, nnz = Nx, Ny ÷ ranks[2], nz ÷ ranks[3]
+
+        if (nny * ranks[2] < Ny) && (ri == ranks[2])
+            nny = Ny - nny * (ranks[2] - 1)
+        end
+    elseif local_direction == :z
+        #TODO: a warning here?
+        return grid
+    end
+
+    new_arch  = Distributed(child_arch; partition = Partition(ranks...))
+    global_sz = global_size(new_arch, (nnx, nny, nnz))
+    global_sz = deflate_tuple(TX, TY, TZ, global_sz)
+
+    return construct_grid(grid, new_arch, FT; 
+                          size = global_sz, 
+                          x = xG, y = yG, z = zG,
+                          topology = (TX, TY, TZ))
+end
+
+function construct_grid(::RectilinearGrid, arch, FT; size, x, y, z, topology) 
+    TX, TY, TZ = topology
+    x = TX == Flat ? nothing : x
+    y = TY == Flat ? nothing : y
+    z = TZ == Flat ? nothing : z
+
+    return RectilinearGrid(arch, FT; size, 
+                           x, y, z,
+                           topology)
+end
+
+function construct_grid(::LatitudeLongitudeGrid, arch, FT; size, x, y, z, topology) 
+    TX, TY, TZ = topology
+    longitude = TX == Flat ? nothing : x
+    latitude  = TY == Flat ? nothing : y
+    z         = TZ == Flat ? nothing : z
+
+    return LatitudeLongitudeGrid(arch, FT; size, 
+                                 longitude, latitude, z,
+                                 topology)
+end
diff --git a/src/Fields/Fields.jl b/src/Fields/Fields.jl
index 56b2d11594..caa949d0c4 100644
--- a/src/Fields/Fields.jl
+++ b/src/Fields/Fields.jl
@@ -47,6 +47,8 @@ end
 
 @inline field(loc, a::Function, grid) = FunctionField(loc, a, grid)
 @inline field(loc, a::Number, grid) = ConstantField(a)
+@inline field(loc, a::ZeroField, grid) = a
+@inline field(loc, a::ConstantField, grid) = a
 
 @inline function field(loc, f::Field, grid)
     loc === location(f) && grid === f.grid && return f
diff --git a/src/Fields/abstract_field.jl b/src/Fields/abstract_field.jl
index f4d4fde9d8..c8c9b7b5f1 100644
--- a/src/Fields/abstract_field.jl
+++ b/src/Fields/abstract_field.jl
@@ -10,7 +10,7 @@ using Oceananigans.Grids: interior_indices, interior_parent_indices
 
 import Base: minimum, maximum, extrema
 import Oceananigans: location, instantiated_location
-import Oceananigans.Architectures: architecture
+import Oceananigans.Architectures: architecture, child_architecture
 import Oceananigans.Grids: interior_x_indices, interior_y_indices, interior_z_indices
 import Oceananigans.Grids: total_size, topology, nodes, xnodes, ynodes, znodes, node, xnode, ynode, znode
 import Oceananigans.Utils: datatuple
@@ -39,9 +39,11 @@ Base.IndexStyle(::AbstractField) = IndexCartesian()
 @inline location(a, i) = location(a)[i]
 @inline location(::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = (LX, LY, LZ) # note no instantiation
 @inline instantiated_location(::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} = (LX(), LY(), LZ())
+Base.eltype(::AbstractField{<:Any, <:Any, <:Any, <:Any, T}) where T = T
 
 "Returns the architecture of on which `f` is defined."
 architecture(f::AbstractField) = architecture(f.grid)
+child_architecture(f::AbstractField) = child_architecture(architecture(f))
 
 "Returns the topology of a fields' `grid`."
 @inline topology(f::AbstractField, args...) = topology(f.grid, args...)
@@ -57,6 +59,39 @@ Base.size(f::AbstractField) = size(f.grid, location(f))
 Base.length(f::AbstractField) = prod(size(f))
 Base.parent(f::AbstractField) = f
 
+const Abstract3DField = AbstractField{<:Any, <:Any, <:Any, <:Any, <:Any, 3}
+const Abstract4DField = AbstractField{<:Any, <:Any, <:Any, <:Any, <:Any, 4}
+
+# TODO: to omit boundaries on Face fields, we have to return 2:N
+# when topo=Bounded, and loc=Face
+@inline axis(::Colon, N) = Base.OneTo(N)
+@inline axis(index::UnitRange, N) = index
+
+@inline function Base.axes(f::Abstract3DField)
+    Nx, Ny, Nz = size(f)
+    ix, iy, iz = indices(f)
+
+    ax = axis(ix, Nx)
+    ay = axis(iy, Ny)
+    az = axis(iz, Nz)
+
+    return (ax, ay, az)
+end
+
+@inline function Base.axes(f::Abstract4DField)
+    Nx, Ny, Nz, Nt = size(f)
+    ix, iy, iz = indices(f)
+
+    ax = axis(ix, Nx)
+    ay = axis(iy, Ny)
+    az = axis(iz, Nz)
+    at = Base.OneTo(Nt)
+
+    return (ax, ay, az, at)
+end
+
+
+
 """
     total_size(field::AbstractField)
 
diff --git a/src/Fields/background_fields.jl b/src/Fields/background_fields.jl
index 9ac557cb5d..7f34e955ac 100644
--- a/src/Fields/background_fields.jl
+++ b/src/Fields/background_fields.jl
@@ -60,6 +60,8 @@ func(x, y, z, t, parameters)
 """
 BackgroundField(func; parameters=nothing) = BackgroundField(func, parameters)
 
+regularize_background_field(LX, LY, LZ, bf::BackgroundField{<:Number}, grid, clock) = ConstantField(bf.func)
+
 regularize_background_field(LX, LY, LZ, f::BackgroundField{<:Function}, grid, clock) =
     FunctionField{LX, LY, LZ}(f.func, grid; clock=clock, parameters=f.parameters)
 
diff --git a/src/Fields/field.jl b/src/Fields/field.jl
index 81021f3743..ee806db849 100644
--- a/src/Fields/field.jl
+++ b/src/Fields/field.jl
@@ -6,6 +6,7 @@ using Adapt
 using KernelAbstractions: @kernel, @index
 using Base: @propagate_inbounds
 
+import Oceananigans: boundary_conditions
 import Oceananigans.Architectures: on_architecture
 import Oceananigans.BoundaryConditions: fill_halo_regions!, getbc
 import Statistics: norm, mean, mean!
@@ -251,6 +252,9 @@ function offset_windowed_data(data, data_indices, Loc, grid, view_indices)
     return offset_data(windowed_parent, loc, topo, sz, halo, view_indices)
 end
 
+convert_colon_indices(view_indices, field_indices) = view_indices
+convert_colon_indices(::Colon, field_indices) = field_indices
+
 """
     view(f::Field, indices...)
 
@@ -303,7 +307,7 @@ function Base.view(f::Field, i, j, k)
     loc = location(f)
 
     # Validate indices (convert Int to UnitRange, error for invalid indices)
-    view_indices = validate_indices((i, j, k), loc, f.grid)
+    view_indices = i, j, k = validate_indices((i, j, k), loc, f.grid)
 
     if view_indices == f.indices # nothing to "view" here
         return f # we want the whole field after all.
@@ -315,6 +319,8 @@ function Base.view(f::Field, i, j, k)
     all(valid_view_indices) ||
         throw(ArgumentError("view indices $((i, j, k)) do not intersect field indices $(f.indices)"))
 
+    view_indices = map(convert_colon_indices, view_indices, f.indices)
+
     # Choice: OffsetArray of view of OffsetArray, or OffsetArray of view?
     #     -> the first retains a reference to the original f.data (an OffsetArray)
     #     -> the second loses it, so we'd have to "re-offset" the underlying data to access.
@@ -349,13 +355,8 @@ Base.view(f::Field, i, j) = view(f, i, j, :)
 
 boundary_conditions(not_field) = nothing
 
-function boundary_conditions(f::Field)
-    if f.indices === default_indices(3) # default boundary conditions
-        return f.boundary_conditions
-    else # filter boundary conditions in windowed directions
-        return FieldBoundaryConditions(f.indices, f.boundary_conditions)
-    end
-end
+@inline boundary_conditions(f::Field) = f.boundary_conditions
+@inline boundary_conditions(w::WindowedField) = FieldBoundaryConditions(w.indices, w.boundary_conditions)
 
 immersed_boundary_condition(f::Field) = f.boundary_conditions.immersed
 data(field::Field) = field.data
@@ -397,14 +398,6 @@ interior(f::Field, I...) = view(interior(f), I...)
 # Don't use axes(f) to checkbounds; use axes(f.data)
 Base.checkbounds(f::Field, I...) = Base.checkbounds(f.data, I...)
 
-function Base.axes(f::Field)
-    if f.indices === (:, : ,:)
-        return Base.OneTo.(size(f))
-    else
-        return Tuple(f.indices[i] isa Colon ? Base.OneTo(size(f, i)) : f.indices[i] for i = 1:3)
-    end
-end
-
 @propagate_inbounds Base.getindex(f::Field, inds...) = getindex(f.data, inds...)
 @propagate_inbounds Base.getindex(f::Field, i::Int)  = parent(f)[i]
 @propagate_inbounds Base.setindex!(f::Field, val, i, j, k) = setindex!(f.data, val, i, j, k)
@@ -774,4 +767,3 @@ function fill_halo_regions!(field::Field, args...; kwargs...)
 
     return nothing
 end
-
diff --git a/src/Fields/function_field.jl b/src/Fields/function_field.jl
index 0a35dfd0af..41b2c8b5fc 100644
--- a/src/Fields/function_field.jl
+++ b/src/Fields/function_field.jl
@@ -26,11 +26,6 @@ struct FunctionField{LX, LY, LZ, C, P, F, G, T} <: AbstractField{LX, LY, LZ, G,
         return new{LX, LY, LZ, C, P, F, G, FT}(func, grid, clock, parameters)
     end
 
-    @doc """
-        FunctionField{LX, LY, LZ}(func::FunctionField, grid; clock) where {LX, LY, LZ}
-
-    Adds `clock` to an existing `FunctionField` and relocates it to `(LX, LY, LZ)` on `grid`.
-    """
     @inline function FunctionField{LX, LY, LZ}(f::FunctionField,
                                                grid::G;
                                                clock::C=nothing) where {LX, LY, LZ, G, C}
@@ -45,12 +40,7 @@ end
 fieldify_function(L, a, grid) = a
 fieldify_function(L, a::Function, grid) = FunctionField(L, a, grid)
 
-"""
-    FunctionField(L::Tuple, func, grid)
-
-Returns a stationary `FunctionField` on `grid` and at location `L = (LX, LY, LZ)`,
-where `func` is callable with signature `func(x, y, z)`.
-"""
+# This is a convenience form with `L` as positional argument.
 @inline FunctionField(L::Tuple, func, grid) = FunctionField{L[1], L[2], L[3]}(func, grid)
 
 @inline indices(::FunctionField) = (:, :, :)
diff --git a/src/Fields/interpolate.jl b/src/Fields/interpolate.jl
index 85a5cdf10b..6acd5b7ad9 100644
--- a/src/Fields/interpolate.jl
+++ b/src/Fields/interpolate.jl
@@ -1,4 +1,4 @@
-using Oceananigans.Grids: topology, node,
+using Oceananigans.Grids: topology, node, _node,
                           xspacings, yspacings, zspacings, λspacings, φspacings,
                           XFlatGrid, YFlatGrid, ZFlatGrid,
                           XYFlatGrid, YZFlatGrid, XZFlatGrid,
@@ -183,7 +183,6 @@ end
 @inline function _fractional_indices((x, y), grid, ℓx, ℓy, ::Nothing)
     ii = fractional_x_index(x, (ℓx, ℓy, nothing), grid)
     jj = fractional_y_index(y, (ℓx, ℓy, nothing), grid)
-
     return (ii, jj, nothing)
 end
 
@@ -237,7 +236,6 @@ where `at_node` is a tuple of coordinates and and `from_loc = (ℓx, ℓy, ℓz)
 Note that this is a lower-level `interpolate` method defined for use in CPU/GPU kernels.
 """
 @inline function interpolate(at_node, from_field, from_loc, from_grid)
-    # field, LX, LY, LZ, grid, x, y, z)
     ii, jj, kk = fractional_indices(at_node, from_grid, from_loc...)
 
     ix = interpolator(ii)
@@ -314,11 +312,27 @@ Interpolate `field` to the physical point `(x, y, z)` using trilinear interpolat
     return interpolate(to_node, from_field, from_loc, from_field.grid)
 end
 
+@inline flatten_node(x, y, z) = (x, y, z)
+
+@inline flatten_node(::Nothing, y, z) = flatten_node(y, z)
+@inline flatten_node(x, ::Nothing, z) = flatten_node(x, z)
+@inline flatten_node(x, y, ::Nothing) = flatten_node(x, y)
+
+@inline flatten_node(x, y) = (x, y)
+@inline flatten_node(::Nothing, y) = flatten_node(y)
+@inline flatten_node(x, ::Nothing) = flatten_node(x)
+
+@inline flatten_node(x) = tuple(x)
+@inline flatten_node(::Nothing) = tuple()
+
 @kernel function _interpolate!(to_field, to_grid, to_location,
                                from_field, from_grid, from_location)
 
     i, j, k = @index(Global, NTuple)
-    to_node = node(i, j, k, to_grid, to_location...)
+
+    to_node = _node(i, j, k, to_grid, to_location...)
+    to_node = flatten_node(to_node...)
+
     @inbounds to_field[i, j, k] = interpolate(to_node, from_field, from_location, from_grid)
 end
 
diff --git a/src/Fields/set!.jl b/src/Fields/set!.jl
index ebd07c8ae5..e4b8a29cb6 100644
--- a/src/Fields/set!.jl
+++ b/src/Fields/set!.jl
@@ -3,9 +3,24 @@ using KernelAbstractions: @kernel, @index
 using Adapt: adapt_structure
 
 using Oceananigans.Grids: on_architecture, node_names
-using Oceananigans.Architectures: device, GPU, CPU
+using Oceananigans.Architectures: child_architecture, device, GPU, CPU
 using Oceananigans.Utils: work_layout
 
+#####
+##### Utilities
+#####
+
+function tuple_string(tup::Tuple)
+    str = prod(string(t, ", ") for t in tup)
+    return str[1:end-2] # remove trailing ", "
+end
+
+tuple_string(tup::Tuple{}) = ""
+
+#####
+##### set!
+#####
+
 function set!(Φ::NamedTuple; kwargs...)
     for (fldname, value) in kwargs
         ϕ = getproperty(Φ, fldname)
@@ -14,25 +29,29 @@ function set!(Φ::NamedTuple; kwargs...)
     return nothing
 end
 
+# This interface helps us do things like set distributed fields
+set!(u::Field, f::Function) = set_to_function!(u, f)
+set!(u::Field, a::Union{Array, CuArray, OffsetArray}) = set_to_array!(u, a)
+set!(u::Field, v::Field) = set_to_field!(u, v)
+
 function set!(u::Field, v)
     u .= v # fallback
     return u
 end
 
-function tuple_string(tup::Tuple)
-    str = prod(string(t, ", ") for t in tup)
-    return str[1:end-2] # remove trailing ", "
-end
-
-tuple_string(tup::Tuple{}) = ""
+#####
+##### Setting to specific things
+#####
 
-function set!(u::Field, f::Function)
+function set_to_function!(u, f)
+    # Supports serial and distributed
+    arch = child_architecture(u)
 
     # Determine cpu_grid and cpu_u
-    if architecture(u) isa GPU
+    if arch isa GPU
         cpu_grid = on_architecture(CPU(), u.grid)
         cpu_u = Field(location(u), cpu_grid; indices = indices(u))
-    elseif architecture(u) isa CPU
+    elseif arch isa CPU
         cpu_grid = u.grid
         cpu_u = u
     end
@@ -40,7 +59,7 @@ function set!(u::Field, f::Function)
     # Form a FunctionField from `f`
     f_field = field(location(u), f, cpu_grid)
 
-    # Try to set the FuncitonField to cpu_u
+    # Try to set the FunctionField to cpu_u
     try
         set!(cpu_u, f_field)
     catch err
@@ -62,14 +81,14 @@ function set!(u::Field, f::Function)
     end
 
     # Transfer data to GPU if u is on the GPU
-    if architecture(u) isa GPU
+    if child_architecture(u) isa GPU
         set!(u, cpu_u)
     end
 
     return u
 end
 
-function set!(u::Field, f::Union{Array, CuArray, OffsetArray})
+function set_to_array!(u, f)
     f = on_architecture(architecture(u), f)
 
     try
@@ -91,11 +110,11 @@ function set!(u::Field, f::Union{Array, CuArray, OffsetArray})
     return u
 end
 
-function set!(u::Field, v::Field)
+function set_to_field!(u, v)
     # We implement some niceities in here that attempt to copy halo data,
     # and revert to copying just interior points if that fails.
     
-    if architecture(u) === architecture(v)
+    if child_architecture(u) === child_architecture(v)
         # Note: we could try to copy first halo point even when halo
         # regions are a different size. That's a bit more complicated than
         # the below so we leave it for the future.
@@ -107,7 +126,7 @@ function set!(u::Field, v::Field)
             interior(u) .= interior(v)
         end
     else
-        v_data = on_architecture(architecture(u), v.data)
+        v_data = on_architecture(child_architecture(u), v.data)
         
         # As above, we permit ourselves a little ambition and try to copy halo data:
         try
diff --git a/src/Forcings/Forcings.jl b/src/Forcings/Forcings.jl
index 0df515ba82..2c9ccf4835 100644
--- a/src/Forcings/Forcings.jl
+++ b/src/Forcings/Forcings.jl
@@ -3,6 +3,8 @@ module Forcings
 export Forcing, ContinuousForcing, DiscreteForcing, Relaxation, GaussianMask, LinearTarget, AdvectiveForcing
 
 using Oceananigans.Fields
+using Oceananigans.OutputReaders: FlavorOfFTS
+using Oceananigans.Units: Time
 import Oceananigans.Architectures: on_architecture
 
 include("multiple_forcings.jl")
diff --git a/src/Forcings/forcing.jl b/src/Forcings/forcing.jl
index 6757b6452b..c3f405be62 100644
--- a/src/Forcings/forcing.jl
+++ b/src/Forcings/forcing.jl
@@ -164,6 +164,9 @@ end
 # Support the case that forcing data is loaded in a 3D array:
 @inline array_forcing_func(i, j, k, grid, clock, fields, a) = @inbounds a[i, j, k]
 
+# Support the case that forcing data is loaded in a 4D `FieldTimeSeries`:
+@inline field_time_series_forcing_func(i, j, k, grid, clock, fields, a::FlavorOfFTS) = @inbounds a[i, j, k, Time(clock.time)]
+
 """
     Forcing(array::AbstractArray)
 
@@ -173,3 +176,12 @@ Forcing is computed by calling `array[i, j, k]`, so `array` must be 3D with `siz
 """
 Forcing(array::AbstractArray) = Forcing(array_forcing_func; discrete_form=true, parameters=array)
 
+"""
+    Forcing(fts::FlavorOfFTS)
+
+Return a `Forcing` by a `FieldTimeSeries`, which can be added to the tendency of a model field.
+
+Forcing is computed by calling `fts[i, j, k, Time(clock.time)]`, so the `FieldTimeSeries` must have the spatial dimensions of the `grid`.
+"""
+Forcing(fts::FlavorOfFTS) = Forcing(field_time_series_forcing_func; discrete_form=true, parameters=fts)
+
diff --git a/src/Forcings/model_forcing.jl b/src/Forcings/model_forcing.jl
index 2870e62fe2..a181eb9632 100644
--- a/src/Forcings/model_forcing.jl
+++ b/src/Forcings/model_forcing.jl
@@ -27,6 +27,8 @@ regularize_forcing(::Nothing, field::AbstractField, field_name, model_field_name
 
 # TODO: some checking that `array` is validly-sized could be done here
 regularize_forcing(array::AbstractArray, field::AbstractField, field_name, model_field_names) = Forcing(array)
+regularize_forcing(fts::FlavorOfFTS, field::AbstractField, field_name, model_field_names) = Forcing(fts)
+
 
 """
     model_forcing(model_fields; forcings...)
diff --git a/src/Forcings/multiple_forcings.jl b/src/Forcings/multiple_forcings.jl
index 30123ec36d..be3aaa0dd8 100644
--- a/src/Forcings/multiple_forcings.jl
+++ b/src/Forcings/multiple_forcings.jl
@@ -21,12 +21,16 @@ function MultipleForcings(forcings)
     return MultipleForcings{N, F}(forcings)
 end
 
+MultipleForcings(args...) = MultipleForcings(tuple(args...))
+
 function regularize_forcing(forcing_tuple::Tuple, field, field_name, model_field_names)
     forcings = Tuple(regularize_forcing(f, field, field_name, model_field_names)
                      for f in forcing_tuple)
     return MultipleForcings(forcings)
 end
 
+regularize_forcing(mf::MultipleForcings, args...) = regularize_forcing(mf.forcings, args...)
+
 @inline (mf::MultipleForcings{1})(i, j, k, grid, clock, model_fields) = mf.forcings[1](i, j, k, grid, clock, model_fields)
     
 @inline (mf::MultipleForcings{2})(i, j, k, grid, clock, model_fields) = mf.forcings[1](i, j, k, grid, clock, model_fields) +
@@ -41,18 +45,14 @@ end
                                                                         mf.forcings[3](i, j, k, grid, clock, model_fields) +
                                                                         mf.forcings[4](i, j, k, grid, clock, model_fields)
 
-# The magic (which doesn't seem to work on GPU now)
-@inline function (mf::MultipleForcings{N})(i, j, k, grid, clock, model_fields) where N
-    total_forcing = zero(grid)
-    forcings = mf.forcings
-    ntuple(Val(N)) do n
+@generated function (mf::MultipleForcings{N})(i, j, k, grid, clock, model_fields) where N
+    quote
+        total_forcing = zero(grid)
+        forcings = mf.forcings
         Base.@_inline_meta
-        @inbounds begin
-            nth_forcing = forcings[n]
-            total_forcing += nth_forcing(i, j, k, grid, clock, model_fields)
-        end
+        $([:(@inbounds total_forcing += forcings[$n](i, j, k, grid, clock, model_fields)) for n in 1:N]...)
+        return total_forcing
     end
-    return total_forcing
 end
 
 Base.summary(mf::MultipleForcings) = string("MultipleForcings with ", length(mf.forcings), " forcing",
diff --git a/src/Grids/Grids.jl b/src/Grids/Grids.jl
index 8fccf0e75a..fd8aebbf21 100644
--- a/src/Grids/Grids.jl
+++ b/src/Grids/Grids.jl
@@ -103,35 +103,6 @@ Grid topology for dimensions that are connected to other models or domains only
 """
 struct RightConnected <: AbstractTopology end
 
-"""
-    AbstractGrid{FT, TX, TY, TZ}
-
-Abstract supertype for grids with elements of type `FT` and topology `{TX, TY, TZ}`.
-"""
-abstract type AbstractGrid{FT, TX, TY, TZ, Arch} end
-
-"""
-    AbstractUnderlyingGrid{FT, TX, TY, TZ}
-
-Abstract supertype for "primary" grids (as opposed to grids with immersed boundaries)
-with elements of type `FT` and topology `{TX, TY, TZ}`.
-"""
-abstract type AbstractUnderlyingGrid{FT, TX, TY, TZ, Arch} <: AbstractGrid{FT, TX, TY, TZ, Arch} end
-
-"""
-    AbstractCurvilinearGrid{FT, TX, TY, TZ}
-
-Abstract supertype for curvilinear grids with elements of type `FT` and topology `{TX, TY, TZ}`.
-"""
-abstract type AbstractCurvilinearGrid{FT, TX, TY, TZ, Arch} <: AbstractUnderlyingGrid{FT, TX, TY, TZ, Arch} end
-
-"""
-    AbstractHorizontallyCurvilinearGrid{FT, TX, TY, TZ}
-
-Abstract supertype for horizontally-curvilinear grids with elements of type `FT` and topology `{TX, TY, TZ}`.
-"""
-abstract type AbstractHorizontallyCurvilinearGrid{FT, TX, TY, TZ, Arch} <: AbstractCurvilinearGrid{FT, TX, TY, TZ, Arch} end
-
 #####
 ##### Directions (for tilted domains)
 #####
@@ -144,20 +115,7 @@ struct ZDirection <: AbstractDirection end
 
 struct NegativeZDirection <: AbstractDirection end
 
-const XFlatGrid = AbstractGrid{<:Any, Flat}
-const YFlatGrid = AbstractGrid{<:Any, <:Any, Flat}
-const ZFlatGrid = AbstractGrid{<:Any, <:Any, <:Any, Flat}
-
-const XYFlatGrid = AbstractGrid{<:Any, Flat, Flat}
-const XZFlatGrid = AbstractGrid{<:Any, Flat, <:Any, Flat}
-const YZFlatGrid = AbstractGrid{<:Any, <:Any, Flat, Flat}
-
-const XYZFlatGrid = AbstractGrid{<:Any, Flat, Flat, Flat}
-
-isrectilinear(grid) = false
-@inline active_surface_map(::AbstractGrid) = nothing
-@inline active_interior_map(::AbstractGrid) = nothing
-
+include("abstract_grid.jl")
 include("grid_utils.jl")
 include("nodes_and_spacings.jl")
 include("zeros_and_ones.jl")
diff --git a/src/Grids/abstract_grid.jl b/src/Grids/abstract_grid.jl
new file mode 100644
index 0000000000..92b0c3f21b
--- /dev/null
+++ b/src/Grids/abstract_grid.jl
@@ -0,0 +1,101 @@
+"""
+    AbstractGrid{FT, TX, TY, TZ, Arch}
+
+Abstract supertype for grids with elements of type `FT` and topology `{TX, TY, TZ}`.
+"""
+abstract type AbstractGrid{FT, TX, TY, TZ, Arch} end
+
+"""
+    AbstractUnderlyingGrid{FT, TX, TY, TZ, Arch}
+
+Abstract supertype for "primary" grids (as opposed to grids with immersed boundaries)
+with elements of type `FT` and topology `{TX, TY, TZ}`.
+"""
+abstract type AbstractUnderlyingGrid{FT, TX, TY, TZ, Arch} <: AbstractGrid{FT, TX, TY, TZ, Arch} end
+
+"""
+    AbstractCurvilinearGrid{FT, TX, TY, TZ, Arch}
+
+Abstract supertype for curvilinear grids with elements of type `FT` and topology `{TX, TY, TZ}`.
+"""
+abstract type AbstractCurvilinearGrid{FT, TX, TY, TZ, Arch} <: AbstractUnderlyingGrid{FT, TX, TY, TZ, Arch} end
+
+"""
+    AbstractHorizontallyCurvilinearGrid{FT, TX, TY, TZ, Arch}
+
+Abstract supertype for horizontally-curvilinear grids with elements of type `FT` and topology `{TX, TY, TZ}`.
+"""
+abstract type AbstractHorizontallyCurvilinearGrid{FT, TX, TY, TZ, Arch} <: AbstractCurvilinearGrid{FT, TX, TY, TZ, Arch} end
+
+const XFlatGrid = AbstractGrid{<:Any, Flat}
+const YFlatGrid = AbstractGrid{<:Any, <:Any, Flat}
+const ZFlatGrid = AbstractGrid{<:Any, <:Any, <:Any, Flat}
+
+const XYFlatGrid = AbstractGrid{<:Any, Flat, Flat}
+const XZFlatGrid = AbstractGrid{<:Any, Flat, <:Any, Flat}
+const YZFlatGrid = AbstractGrid{<:Any, <:Any, Flat, Flat}
+
+const XYZFlatGrid = AbstractGrid{<:Any, Flat, Flat, Flat}
+
+isrectilinear(grid) = false
+
+# Fallback
+@inline  retrieve_surface_active_cells_map(::AbstractGrid) = nothing
+@inline retrieve_interior_active_cells_map(::AbstractGrid, any_map_type) = nothing
+
+"""
+    topology(grid)
+
+Return a tuple with the topology of the `grid` for each dimension.
+"""
+@inline topology(::AbstractGrid{FT, TX, TY, TZ}) where {FT, TX, TY, TZ} = (TX, TY, TZ)
+
+"""
+    topology(grid, dim)
+
+Return the topology of the `grid` for the `dim`-th dimension.
+"""
+@inline topology(grid, dim) = topology(grid)[dim]
+
+"""
+    architecture(grid::AbstractGrid)
+
+Return the architecture (CPU or GPU) that the `grid` lives on.
+"""
+@inline architecture(grid::AbstractGrid) = grid.architecture
+
+"""
+    size(grid)
+
+Return a 3-tuple of the number of "center" cells on a grid in (x, y, z).
+Center cells have the location (Center, Center, Center).
+"""
+@inline Base.size(grid::AbstractGrid) = (grid.Nx, grid.Ny, grid.Nz)
+Base.eltype(::AbstractGrid{FT}) where FT = FT
+Base.eps(::AbstractGrid{FT}) where FT = eps(FT)
+
+function Base.:(==)(grid1::AbstractGrid, grid2::AbstractGrid)
+    # Check that grid2 is an instance of grid1's (unparameterized) grid type
+    !isa(grid2, typeof(grid1).name.wrapper) && return false
+
+    topology(grid1) !== topology(grid2) && return false
+
+    x1, y1, z1 = nodes(grid1, (Face(), Face(), Face()))
+    x2, y2, z2 = nodes(grid2, (Face(), Face(), Face()))
+
+    CUDA.@allowscalar return x1 == x2 && y1 == y2 && z1 == z2
+end
+
+"""
+    halo_size(grid)
+
+Return a 3-tuple with the number of halo cells on either side of the
+domain in (x, y, z).
+"""
+halo_size(grid) = (grid.Hx, grid.Hy, grid.Hz)
+halo_size(grid, d) = halo_size(grid)[d]
+
+@inline Base.size(grid::AbstractGrid, d::Int) = size(grid)[d]
+
+grid_name(grid::AbstractGrid) = typeof(grid).name.wrapper
+
diff --git a/src/Grids/automatic_halo_sizing.jl b/src/Grids/automatic_halo_sizing.jl
index bfc6870923..a4fd19b526 100644
--- a/src/Grids/automatic_halo_sizing.jl
+++ b/src/Grids/automatic_halo_sizing.jl
@@ -19,6 +19,7 @@ required_halo_size(CenteredFourthOrder())
 function required_halo_size end
 
 required_halo_size(tendency_term) = 1
+required_halo_size(::Nothing) = 0
 
 inflate_halo_size_one_dimension(req_H, old_H, _, grid)            = max(req_H, old_H)
 inflate_halo_size_one_dimension(req_H, old_H, ::Type{Flat}, grid) = 0
diff --git a/src/Grids/grid_generation.jl b/src/Grids/grid_generation.jl
index 056c4fb7af..b0ba1ad7dc 100644
--- a/src/Grids/grid_generation.jl
+++ b/src/Grids/grid_generation.jl
@@ -3,6 +3,7 @@ get_domain_extent(::Nothing, N)             = (1, 1)
 get_domain_extent(coord, N)                 = (coord[1], coord[2])
 get_domain_extent(coord::Function, N)       = (coord(1), coord(N+1))
 get_domain_extent(coord::AbstractVector, N) = CUDA.@allowscalar (coord[1], coord[N+1])
+get_domain_extent(coord::Number, N)         = (coord, coord)
 
 get_face_node(coord::Nothing, i) = 1
 get_face_node(coord::Function, i) = coord(i)
@@ -119,5 +120,13 @@ function generate_coordinate(FT, topo::AT, N, H, node_interval::Tuple{<:Number,
 end
 
 # Flat domains
-generate_coordinate(FT, ::Flat, N, H, coord::Tuple{<:Number, <:Number}, coordinate_name, arch) =
-    FT(1), range(1, 1, length=N), range(1, 1, length=N), FT(1), FT(1)
+generate_coordinate(FT, ::Flat, N, H, c::Number, coordinate_name, arch) =
+    FT(1), range(FT(c), FT(c), length=N), range(FT(c), FT(c), length=N), FT(1), FT(1)
+
+# What's the use case for this?
+# generate_coordinate(FT, ::Flat, N, H, c::Tuple{Number, Number}, coordinate_name, arch) =
+#     FT(1), c, c, FT(1), FT(1)
+
+generate_coordinate(FT, ::Flat, N, H, ::Nothing, coordinate_name, arch) =
+    FT(1), nothing, nothing, FT(1), FT(1)
+
diff --git a/src/Grids/grid_utils.jl b/src/Grids/grid_utils.jl
index b71a2855fd..c881ebeb86 100644
--- a/src/Grids/grid_utils.jl
+++ b/src/Grids/grid_utils.jl
@@ -4,6 +4,34 @@ using Base.Ryu: writeshortest
 using LinearAlgebra: dot, cross
 using OffsetArrays: IdOffsetRange
 
+"""
+    _property(ξ, ℓ, T, N, with_halos)
+
+Return the grid property `ξ`, either `with_halos` or without,
+for (instantiated) location `ℓ`, topology type `T`, and dimension length `N`.
+"""
+@inline function _property(ξ, ℓ, T, N, with_halos)
+    if with_halos
+        return ξ
+    else
+        i = interior_indices(ℓ, T(), N)
+        return view(ξ, i)
+    end
+end
+
+@inline function _property(ξ, ℓx, ℓy, Tx, Ty, Nx, Ny, with_halos)
+    if with_halos
+        return ξ
+    else
+        i = interior_indices(ℓx, Tx(), Nx)
+        j = interior_indices(ℓy, Ty(), Ny)
+        return view(ξ, i, j)
+    end
+end
+
+@inline _property(ξ::Number, args...) = ξ
+@inline _property(::Nothing, args...) = nothing
+
 # Define default indices in a type-stable way
 @inline default_indices(N::Int) = default_indices(Val(N))
 
@@ -15,42 +43,6 @@ using OffsetArrays: IdOffsetRange
 end
 
 const BoundedTopology = Union{Bounded, LeftConnected}
-
-"""
-    topology(grid)
-
-Return a tuple with the topology of the `grid` for each dimension.
-"""
-@inline topology(::AbstractGrid{FT, TX, TY, TZ}) where {FT, TX, TY, TZ} = (TX, TY, TZ)
-
-"""
-    topology(grid, dim)
-
-Return the topology of the `grid` for the `dim`-th dimension.
-"""
-@inline topology(grid, dim) = topology(grid)[dim]
-
-"""
-    architecture(grid::AbstractGrid)
-
-Return the architecture (CPU or GPU) that the `grid` lives on.
-"""
-@inline architecture(grid::AbstractGrid) = grid.architecture
-
-Base.eltype(::AbstractGrid{FT}) where FT = FT
-
-function Base.:(==)(grid1::AbstractGrid, grid2::AbstractGrid)
-    #check if grids are of the same type
-    !isa(grid2, typeof(grid1).name.wrapper) && return false
-
-    topology(grid1) !== topology(grid2) && return false
-
-    x1, y1, z1 = nodes(grid1, (Face(), Face(), Face()))
-    x2, y2, z2 = nodes(grid2, (Face(), Face(), Face()))
-
-    CUDA.@allowscalar return x1 == x2 && y1 == y2 && z1 == z2
-end
-
 const AT = AbstractTopology
 
 Base.length(::Face,    ::BoundedTopology, N) = N + 1
@@ -65,40 +57,6 @@ Base.length(::Center,  ::Flat,            N) = N
 Base.length(loc, topo::AT, N, ::Colon) = length(loc, topo, N)
 Base.length(loc, topo::AT, N, ind::UnitRange) = min(length(loc, topo, N), length(ind))
 
-"""
-    size(grid)
-
-Return a 3-tuple of the number of "center" cells on a grid in (x, y, z).
-Center cells have the location (Center, Center, Center).
-"""
-@inline Base.size(grid::AbstractGrid) = (grid.Nx, grid.Ny, grid.Nz)
-
-"""
-    halo_size(grid)
-
-Return a 3-tuple with the number of halo cells on either side of the
-domain in (x, y, z).
-"""
-halo_size(grid) = (grid.Hx, grid.Hy, grid.Hz)
-halo_size(grid, d) = halo_size(grid)[d]
-
-@inline Base.size(grid::AbstractGrid, d::Int) = size(grid)[d]
-
-@inline Base.size(grid::AbstractGrid, loc::Tuple, indices=default_indices(Val(length(loc)))) =
-    size(loc, topology(grid), size(grid), indices)
-
-@inline function Base.size(loc, topo, sz, indices=default_indices(Val(length(loc))))
-    D = length(loc)
-
-    # (it's type stable?)
-    return ntuple(Val(D)) do d
-        Base.@_inline_meta
-        length(instantiate(loc[d]), instantiate(topo[d]), sz[d], indices[d])
-    end
-end
-
-Base.size(grid::AbstractGrid, loc::Tuple, d::Int) = size(grid, loc)[d]
-
 """
     total_length(loc, topo, N, H=0, ind=Colon())
 
@@ -119,6 +77,21 @@ total_length(::Center,  ::Flat,            N, H=0) = N
 total_length(loc, topo, N, H, ::Colon) = total_length(loc, topo, N, H)
 total_length(loc, topo, N, H, ind::UnitRange) = min(total_length(loc, topo, N, H), length(ind))
 
+@inline Base.size(grid::AbstractGrid, loc::Tuple, indices=default_indices(Val(length(loc)))) =
+    size(loc, topology(grid), size(grid), indices)
+
+@inline function Base.size(loc, topo, sz, indices=default_indices(Val(length(loc))))
+    D = length(loc)
+
+    # (it's type stable?)
+    return ntuple(Val(D)) do d
+        Base.@_inline_meta
+        length(instantiate(loc[d]), instantiate(topo[d]), sz[d], indices[d])
+    end
+end
+
+Base.size(grid::AbstractGrid, loc::Tuple, d::Int) = size(grid, loc)[d]
+
 total_size(a) = size(a) # fallback
 
 """
@@ -147,7 +120,9 @@ constant grid spacing `Δ`, and interior extent `L`.
 
 # Grid domains
 @inline domain(topo, N, ξ) = CUDA.@allowscalar ξ[1], ξ[N+1]
-@inline domain(::Flat, N, ξ) = CUDA.@allowscalar ξ[1], ξ[1]
+@inline domain(::Flat, N, ξ::AbstractArray) = CUDA.@allowscalar ξ[1] # guard scalar indexing, as in the generic method
+@inline domain(::Flat, N, ξ::Number) = ξ       # Flat coordinate given as a single number
+@inline domain(::Flat, N, ::Nothing) = nothing # Flat dimension without a coordinate
 
 @inline x_domain(grid) = domain(topology(grid, 1)(), grid.Nx, grid.xᶠᵃᵃ)
 @inline y_domain(grid) = domain(topology(grid, 2)(), grid.Ny, grid.yᵃᶠᵃ)
@@ -231,6 +206,9 @@ parent_index_range(view_indices::UnitRange, loc, topo, halo) = view_indices .+ i
 # Return the index range of parent arrays that are themselves windowed
 parent_index_range(::Colon, args...) = parent_index_range(args...)
 
+parent_index_range(parent_indices::UnitRange, ::Colon, args...) =
+    parent_index_range(parent_indices, parent_indices, args...)
+
 function parent_index_range(parent_indices::UnitRange, view_indices, args...)
     start = first(view_indices) - first(parent_indices) + 1
     stop = start + length(view_indices) - 1
@@ -239,12 +217,9 @@ end
 
 # intersect_index_range(::Colon, ::Colon) = Colon()
 index_range_contains(range,   subset::UnitRange) = (first(subset) ∈ range) & (last(subset) ∈ range)
-index_range_contains(::Colon, subset::UnitRange) = true
+index_range_contains(::Colon, ::UnitRange)       = true
 index_range_contains(::Colon, ::Colon)           = true
-
-# Note: this choice means subset indices are defined on the whole grid.
-# Thus any UnitRange does not contain `:`.
-index_range_contains(range::UnitRange, subset::Colon) = false 
+index_range_contains(::UnitRange, ::Colon)       = true
 
 # Return the index range of "full" parent arrays that span an entire dimension
 parent_windowed_indices(::Colon, loc, topo, halo)            = Colon()
@@ -305,9 +280,11 @@ Base.show(io::IO, dir::AbstractDirection) = print(io, summary(dir))
 
 size_summary(sz) = string(sz[1], "×", sz[2], "×", sz[3])
 prettysummary(σ::AbstractFloat, plus=false) = writeshortest(σ, plus, false, true, -1, UInt8('e'), false, UInt8('.'), false, true)
-dimension_summary(topo::Flat, name, args...) = "Flat $name"
 
-function domain_summary(topo, name, left, right)
+domain_summary(topo::Flat, name, ::Nothing) = "Flat $name"
+domain_summary(topo::Flat, name, coord::Number) = "Flat $name = $coord"
+
+function domain_summary(topo, name, (left, right))
     interval = (topo isa Bounded) ||
                (topo isa LeftConnected) ? "]" : ")"
 
@@ -315,22 +292,24 @@ function domain_summary(topo, name, left, right)
                   topo isa Bounded ? "Bounded  " :
                   topo isa FullyConnected ? "FullyConnected " :
                   topo isa LeftConnected ? "LeftConnected  " :
-                  "RightConnected "
+                  topo isa RightConnected ? "RightConnected " : # same padded width as "FullyConnected "
+                  error("Unexpected topology $topo together with the domain end points ($left, $right)")
 
     return string(topo_string, name, " ∈ [",
                   prettysummary(left), ", ",
                   prettysummary(right), interval)
 end
 
-function dimension_summary(topo, name, left, right, spacing, pad_domain=0)
-    prefix = domain_summary(topo, name, left, right)
+function dimension_summary(topo, name, dom, spacing, pad_domain=0)
+    prefix = domain_summary(topo, name, dom)
     padding = " "^(pad_domain+1) 
-    return string(prefix, padding, coordinate_summary(spacing, name))
+    return string(prefix, padding, coordinate_summary(topo, spacing, name))
 end
 
-coordinate_summary(Δ::Number, name) = @sprintf("regularly spaced with Δ%s=%s", name, prettysummary(Δ))
+coordinate_summary(::Flat, Δ::Number, name) = ""
+coordinate_summary(topo, Δ::Number, name) = @sprintf("regularly spaced with Δ%s=%s", name, prettysummary(Δ))
 
-coordinate_summary(Δ::Union{AbstractVector, AbstractMatrix}, name) =
+coordinate_summary(topo, Δ::Union{AbstractVector, AbstractMatrix}, name) =
     @sprintf("variably spaced with min(Δ%s)=%s, max(Δ%s)=%s",
              name, prettysummary(minimum(parent(Δ))),
              name, prettysummary(maximum(parent(Δ))))
@@ -516,5 +495,3 @@ function add_halos(data::AbstractArray{FT, 2} where FT, loc, topo, sz, halo_sz;
     return add_halos(reshape(data, (Nx, Ny, 1)), loc, topo, sz, halo_sz; warnings)
 end
 
-grid_name(grid::AbstractGrid) = typeof(grid).name.wrapper
-
diff --git a/src/Grids/input_validation.jl b/src/Grids/input_validation.jl
index 5007b1b55e..9624086bba 100644
--- a/src/Grids/input_validation.jl
+++ b/src/Grids/input_validation.jl
@@ -9,9 +9,9 @@ using Oceananigans: tupleit
 "adds tup element with `default` value for flat dimensions"
 inflate_tuple(TX, TY, TZ, tup; default) = tup
 
-inflate_tuple(::Type{Flat}, TY, TZ, tup; default) = tuple(default, tup[1], tup[2])
-inflate_tuple(TY, ::Type{Flat}, TZ, tup; default) = tuple(tup[1], default, tup[2])
-inflate_tuple(TY, TZ, ::Type{Flat}, tup; default) = tuple(tup[1], tup[2], default)
+inflate_tuple(::Type{Flat}, TY, TZ, tup; default) = (default, tup[1], tup[2])
+inflate_tuple(TY, ::Type{Flat}, TZ, tup; default) = (tup[1], default, tup[2])
+inflate_tuple(TY, TZ, ::Type{Flat}, tup; default) = (tup[1], tup[2], default)
 
 inflate_tuple(TX, ::Type{Flat}, ::Type{Flat}, tup; default) = (tup[1], default, default)
 inflate_tuple(::Type{Flat}, TY, ::Type{Flat}, tup; default) = (default, tup[1], default)
@@ -109,15 +109,13 @@ function validate_rectilinear_domain(TX, TY, TZ, FT, size, extent, x, y, z)
             throw(ArgumentError("Cannot specify both 'extent' and 'x, y, z' keyword arguments."))
 
         extent = tupleit(extent)
-
         validate_tupled_argument(extent, Number, "extent", topological_tuple_length(TX, TY, TZ))
-
         Lx, Ly, Lz = extent = inflate_tuple(TX, TY, TZ, extent, default=0)
 
         # An "oceanic" default domain:
-        x = FT.((0, Lx))
-        y = FT.((0, Ly))
-        z = FT.((-Lz, 0))
+        x = TX() isa Flat ? nothing : (zero(FT), convert(FT, Lx))
+        y = TY() isa Flat ? nothing : (zero(FT), convert(FT, Ly))
+        z = TZ() isa Flat ? nothing : (-convert(FT, Lz), zero(FT))
 
     else # isnothing(extent) === true implies that user has not specified a length
         x = validate_dimension_specification(TX, x, :x, size[1], FT)
@@ -154,45 +152,13 @@ end
 
 validate_dimension_specification(::Type{Flat}, ξ::AbstractVector, dir, N, FT) = (FT(ξ[1]), FT(ξ[1]))
 validate_dimension_specification(::Type{Flat}, ξ::Function,       dir, N, FT) = (FT(ξ(1)), FT(ξ(1)))
-validate_dimension_specification(::Type{Flat}, ξ::Tuple,  dir, N, FT) = FT.(ξ)
-validate_dimension_specification(::Type{Flat}, ::Nothing, dir, N, FT) = (zero(FT), zero(FT))
-validate_dimension_specification(::Type{Flat}, ξ::Number, dir, N, FT) = (FT(ξ), FT(ξ))
+validate_dimension_specification(::Type{Flat}, ξ::Tuple,  dir, N, FT) = map(FT, ξ)
+validate_dimension_specification(::Type{Flat}, ::Nothing, dir, N, FT) = nothing
+validate_dimension_specification(::Type{Flat}, ξ::Number, dir, N, FT) = convert(FT, ξ)
 
 default_horizontal_extent(T, extent) = (0, extent[i])
 default_vertical_extent(T, extent) = (-extent[3], 0)
 
-function validate_regular_grid_domain(TX, TY, TZ, FT, extent, x, y, z)
-
-    # Find domain endpoints or domain extent, depending on user input:
-    if !isnothing(extent) # the user has specified an extent!
-
-        (!isnothing(x) || !isnothing(y) || !isnothing(z)) &&
-            throw(ArgumentError("Cannot specify both 'extent' and 'x, y, z' keyword arguments."))
-
-        extent = tupleit(extent)
-
-        validate_tupled_argument(extent, Number, "extent", topological_tuple_length(TX, TY, TZ))
-
-        Lx, Ly, Lz = extent = inflate_tuple(TX, TY, TZ, extent, default=0)
-
-        # An "oceanic" default domain:
-        x = (0, Lx)
-        y = (0, Ly)
-        z = (-Lz, 0)
-
-    else # isnothing(extent) === true implies that user has not specified a length
-        x = validate_dimension_specification(TX, x, :x, FT)
-        y = validate_dimension_specification(TY, y, :y, FT)
-        z = validate_dimension_specification(TZ, z, :z, FT)
-
-        Lx = x[2] - x[1]
-        Ly = y[2] - y[1]
-        Lz = z[2] - z[1]
-    end
-
-    return FT(Lx), FT(Ly), FT(Lz), FT.(x), FT.(y), FT.(z)
-end
-
 function validate_vertically_stretched_grid_xy(TX, TY, FT, x, y)
     x = validate_dimension_specification(TX, x, :x, FT)
     y = validate_dimension_specification(TY, y, :y, FT)
@@ -238,3 +204,4 @@ validate_indices(indices, loc, grid::AbstractGrid) =
 
 validate_indices(indices, loc, topo, sz, halo_sz) =
     map(validate_index, indices, map(instantiate, loc), map(instantiate, topo), sz, halo_sz)
+
diff --git a/src/Grids/latitude_longitude_grid.jl b/src/Grids/latitude_longitude_grid.jl
index accb4b78bb..ebca324492 100644
--- a/src/Grids/latitude_longitude_grid.jl
+++ b/src/Grids/latitude_longitude_grid.jl
@@ -210,7 +210,11 @@ function LatitudeLongitudeGrid(architecture::AbstractArchitecture = CPU(),
                                                          Δzᵃᵃᶠ, Δzᵃᵃᶜ, zᵃᵃᶠ, zᵃᵃᶜ,
                                                          (nothing for i=1:10)..., FT(radius))
 
-    return !precompute_metrics ? preliminary_grid : with_precomputed_metrics(preliminary_grid)
+    if !precompute_metrics
+        return preliminary_grid
+    else
+        return with_precomputed_metrics(preliminary_grid)
+    end
 end
 
 # architecture = CPU() default, assuming that a DataType positional arg
@@ -221,9 +225,18 @@ LatitudeLongitudeGrid(FT::DataType; kwargs...) = LatitudeLongitudeGrid(CPU(), FT
 function with_precomputed_metrics(grid)
     Δxᶠᶜᵃ, Δxᶜᶠᵃ, Δxᶠᶠᵃ, Δxᶜᶜᵃ, Δyᶠᶜᵃ, Δyᶜᶠᵃ, Azᶠᶜᵃ, Azᶜᶠᵃ, Azᶠᶠᵃ, Azᶜᶜᵃ = allocate_metrics(grid)
 
-    precompute_curvilinear_metrics!(grid, Δxᶠᶜᵃ, Δxᶜᶠᵃ, Δxᶠᶠᵃ, Δxᶜᶜᵃ, Azᶠᶜᵃ, Azᶜᶠᵃ, Azᶠᶠᵃ, Azᶜᶜᵃ)
+    # Compute Δx's and areas
+    arch = grid.architecture
+    dev = Architectures.device(arch)
+    workgroup, worksize  = metric_workgroup(grid), metric_worksize(grid)
+    loop! = compute_Δx_Az!(dev, workgroup, worksize)
+    loop!(grid, Δxᶠᶜᵃ, Δxᶜᶠᵃ, Δxᶠᶠᵃ, Δxᶜᶜᵃ, Azᶠᶜᵃ, Azᶜᶠᵃ, Azᶠᶠᵃ, Azᶜᶜᵃ)
 
-    Δyᶠᶜᵃ, Δyᶜᶠᵃ = precompute_Δy_metrics(grid, Δyᶠᶜᵃ, Δyᶜᶠᵃ)
+    # Compute Δy's if needed
+    if !(grid isa YRegularLLG)
+        loop! = compute_Δy!(dev, 16, length(grid.Δφᵃᶜᵃ) - 1)
+        loop!(grid, Δyᶠᶜᵃ, Δyᶜᶠᵃ)
+    end
 
     Nλ, Nφ, Nz = size(grid)
     Hλ, Hφ, Hz = halo_size(grid)
@@ -254,6 +267,10 @@ function validate_lat_lon_grid_args(topology, size, halo, FT, latitude, longitud
         TZ = Bounded
     end
 
+    if TY() isa Periodic
+        throw(ArgumentError("LatitudeLongitudeGrid cannot be Periodic in latitude!"))
+    end
+
     # Validate longitude and latitude
     λ₁, λ₂ = get_domain_extent(longitude, Nλ)
     λ₂ - λ₁ ≤ 360 || throw(ArgumentError("Longitudinal extent cannot be greater than 360 degrees."))
@@ -292,19 +309,19 @@ end
 function Base.show(io::IO, grid::LatitudeLongitudeGrid, withsummary=true)
     TX, TY, TZ = topology(grid)
 
-    λ₁, λ₂ = domain(TX(), size(grid, 1), grid.λᶠᵃᵃ)
-    φ₁, φ₂ = domain(TY(), size(grid, 2), grid.φᵃᶠᵃ)
-    z₁, z₂ = domain(TZ(), size(grid, 3), grid.zᵃᵃᶠ)
+    Ωλ = domain(TX(), size(grid, 1), grid.λᶠᵃᵃ)
+    Ωφ = domain(TY(), size(grid, 2), grid.φᵃᶠᵃ)
+    Ωz = domain(TZ(), size(grid, 3), grid.zᵃᵃᶠ)
 
-    x_summary = domain_summary(TX(), "λ", λ₁, λ₂)
-    y_summary = domain_summary(TY(), "φ", φ₁, φ₂)
-    z_summary = domain_summary(TZ(), "z", z₁, z₂)
+    x_summary = domain_summary(TX(), "λ", Ωλ) 
+    y_summary = domain_summary(TY(), "φ", Ωφ)
+    z_summary = domain_summary(TZ(), "z", Ωz)
 
     longest = max(length(x_summary), length(y_summary), length(z_summary))
 
-    x_summary = "longitude: " * dimension_summary(TX(), "λ", λ₁, λ₂, grid.Δλᶜᵃᵃ, longest - length(x_summary))
-    y_summary = "latitude:  " * dimension_summary(TY(), "φ", φ₁, φ₂, grid.Δφᵃᶜᵃ, longest - length(y_summary))
-    z_summary = "z:         " * dimension_summary(TZ(), "z", z₁, z₂, grid.Δzᵃᵃᶜ, longest - length(z_summary))
+    x_summary = "longitude: " * dimension_summary(TX(), "λ", Ωλ, grid.Δλᶜᵃᵃ, longest - length(x_summary))
+    y_summary = "latitude:  " * dimension_summary(TY(), "φ", Ωφ, grid.Δφᵃᶜᵃ, longest - length(y_summary))
+    z_summary = "z:         " * dimension_summary(TZ(), "z", Ωz, grid.Δzᵃᵃᶜ, longest - length(z_summary))
 
     if withsummary
         print(io, summary(grid), "\n")
@@ -323,47 +340,60 @@ end
 @inline cpu_face_constructor_y(grid::YRegularLLG) = y_domain(grid)
 @inline cpu_face_constructor_z(grid::ZRegularLLG) = z_domain(grid)
 
-function with_halo(new_halo, old_grid::LatitudeLongitudeGrid)
-
-    size = (old_grid.Nx, old_grid.Ny, old_grid.Nz)
-    topo = topology(old_grid)
-
-    x = cpu_face_constructor_x(old_grid)
-    y = cpu_face_constructor_y(old_grid)
-    z = cpu_face_constructor_z(old_grid)
-
-    # Remove elements of size and new_halo in Flat directions as expected by grid
-    # constructor
-    size     = pop_flat_elements(size, topo)
-    new_halo = pop_flat_elements(new_halo, topo)
-
-    new_grid = LatitudeLongitudeGrid(architecture(old_grid), eltype(old_grid);
-                                     size = size, halo = new_halo,
-                                     longitude = x, latitude = y, z = z, topology = topo,
-                                     precompute_metrics = metrics_precomputed(old_grid),
-                                     radius = old_grid.radius)
+function constructor_arguments(grid::LatitudeLongitudeGrid)
+    # Positional arguments of `LatitudeLongitudeGrid(arch, FT; kwargs...)`, keyed by name.
+    arch = architecture(grid)
+    FT = eltype(grid)
+    args = Dict(:architecture => arch, :number_type => FT)
+    # Keyword arguments. Elements of `size` and `halo` in Flat directions are removed, as expected by the grid constructor.
+    topo = topology(grid)
+    size = (grid.Nx, grid.Ny, grid.Nz)
+    halo = (grid.Hx, grid.Hy, grid.Hz)
+    size = pop_flat_elements(size, topo)
+    halo = pop_flat_elements(halo, topo)
+
+    kwargs = Dict(:size => size,
+                  :halo => halo,
+                  :longitude => cpu_face_constructor_x(grid),
+                  :latitude => cpu_face_constructor_y(grid),
+                  :z => cpu_face_constructor_z(grid),
+                  :topology => topo,
+                  :radius => grid.radius,
+                  :precompute_metrics => metrics_precomputed(grid))
+
+    return args, kwargs
+end
 
-    return new_grid
+function Base.similar(grid::LatitudeLongitudeGrid)
+    args, kwargs = constructor_arguments(grid)
+    arch = args[:architecture]
+    FT = args[:number_type]
+    return LatitudeLongitudeGrid(arch, FT; kwargs...)
 end
 
-function on_architecture(new_arch::AbstractSerialArchitecture, old_grid::LatitudeLongitudeGrid)
-    old_properties = (old_grid.Δλᶠᵃᵃ, old_grid.Δλᶜᵃᵃ, old_grid.λᶠᵃᵃ,  old_grid.λᶜᵃᵃ,
-                      old_grid.Δφᵃᶠᵃ, old_grid.Δφᵃᶜᵃ, old_grid.φᵃᶠᵃ,  old_grid.φᵃᶜᵃ,
-                      old_grid.Δzᵃᵃᶠ, old_grid.Δzᵃᵃᶜ, old_grid.zᵃᵃᶠ,  old_grid.zᵃᵃᶜ,
-                      old_grid.Δxᶠᶜᵃ, old_grid.Δxᶜᶠᵃ, old_grid.Δxᶠᶠᵃ, old_grid.Δxᶜᶜᵃ,
-                      old_grid.Δyᶠᶜᵃ, old_grid.Δyᶜᶠᵃ,
-                      old_grid.Azᶠᶜᵃ, old_grid.Azᶜᶠᵃ, old_grid.Azᶠᶠᵃ, old_grid.Azᶜᶜᵃ)
+function with_number_type(FT, grid::LatitudeLongitudeGrid)
+    # Rebuild `grid` from its constructor arguments, swapping in the number type `FT`.
+    positional, keywords = constructor_arguments(grid)
+    return LatitudeLongitudeGrid(positional[:architecture], FT; keywords...)
+end
 
-    new_properties = Tuple(on_architecture(new_arch, p) for p in old_properties)
+function with_halo(halo, grid::LatitudeLongitudeGrid)
+    # Rebuild `grid` from its constructor arguments, replacing only the halo.
+    positional, keywords = constructor_arguments(grid)
+    arch, FT = positional[:architecture], positional[:number_type]
+    # `halo` goes through `pop_flat_elements` before it is handed to the constructor.
+    keywords[:halo] = pop_flat_elements(halo, topology(grid))
+    return LatitudeLongitudeGrid(arch, FT; keywords...)
+end
 
-    TX, TY, TZ = topology(old_grid)
+function on_architecture(arch::AbstractSerialArchitecture, grid::LatitudeLongitudeGrid)
+    if arch == architecture(grid)
+        return grid
+    end
 
-    return LatitudeLongitudeGrid{TX, TY, TZ}(new_arch,
-                                             old_grid.Nx, old_grid.Ny, old_grid.Nz,
-                                             old_grid.Hx, old_grid.Hy, old_grid.Hz,
-                                             old_grid.Lx, old_grid.Ly, old_grid.Lz,
-                                             new_properties...,
-                                             old_grid.radius)
+    args, kwargs = constructor_arguments(grid)
+    FT = args[:number_type]
+    return LatitudeLongitudeGrid(arch, FT; kwargs...)
 end
 
 function Adapt.adapt_structure(to, grid::LatitudeLongitudeGrid)
@@ -443,19 +473,7 @@ end
 @inline metric_worksize(grid::XRegularLLG)  = length(grid.φᵃᶠᵃ) - 2 
 @inline metric_workgroup(grid::XRegularLLG) = 16
 
-function precompute_curvilinear_metrics!(grid, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ)
-    
-    arch = grid.architecture
-
-    workgroup, worksize  = metric_workgroup(grid), metric_worksize(grid)
-    curvilinear_metrics! = precompute_metrics_kernel!(Architectures.device(arch), workgroup, worksize)
-
-    curvilinear_metrics!(grid, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ)
-
-    return nothing
-end
-
-@kernel function precompute_metrics_kernel!(grid::LatitudeLongitudeGrid, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ)
+@kernel function compute_Δx_Az!(grid::LatitudeLongitudeGrid, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ)
     i, j = @index(Global, NTuple)
 
     # Manually offset x- and y-index
@@ -474,7 +492,7 @@ end
     end
 end
 
-@kernel function precompute_metrics_kernel!(grid::XRegularLLG, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ)
+@kernel function compute_Δx_Az!(grid::XRegularLLG, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ)
     j = @index(Global, Linear)
 
     # Manually offset y-index
@@ -496,21 +514,7 @@ end
 ##### Kernels that precompute the y-metric
 #####
 
-function precompute_Δy_metrics(grid::LatitudeLongitudeGrid, Δyᶠᶜ, Δyᶜᶠ)
-    arch = grid.architecture
-    precompute_Δy! = precompute_Δy_kernel!(Architectures.device(arch), 16, length(grid.Δφᵃᶜᵃ) - 1)
-    precompute_Δy!(grid, Δyᶠᶜ, Δyᶜᶠ)
-    
-    return Δyᶠᶜ, Δyᶜᶠ
-end
-
-function  precompute_Δy_metrics(grid::YRegularLLG, Δyᶠᶜ, Δyᶜᶠ)
-    Δyᶜᶠ =  Δyᶜᶠᵃ(1, 1, 1, grid)
-    Δyᶠᶜ =  Δyᶠᶜᵃ(1, 1, 1, grid)
-    return Δyᶠᶜ, Δyᶜᶠ
-end
-
-@kernel function precompute_Δy_kernel!(grid, Δyᶠᶜ, Δyᶜᶠ)
+@kernel function compute_Δy!(grid, Δyᶠᶜ, Δyᶜᶠ)
     j = @index(Global, Linear)
 
     # Manually offset y-index
@@ -528,16 +532,6 @@ end
 
 function allocate_metrics(grid::LatitudeLongitudeGrid)
     FT = eltype(grid)
-    
-    # preallocate quantities to ensure correct type and size
-    grid_metrics = (:Δxᶠᶜ,
-                    :Δxᶜᶠ,
-                    :Δxᶠᶠ,
-                    :Δxᶜᶜ,
-                    :Azᶠᶜ,
-                    :Azᶜᶠ,
-                    :Azᶠᶠ,
-                    :Azᶜᶜ)
 
     arch = grid.architecture
     
@@ -546,18 +540,21 @@ function allocate_metrics(grid::LatitudeLongitudeGrid)
         metric_size = length(grid.φᵃᶜᵃ)
     else
         offsets     = (grid.Δλᶜᵃᵃ.offsets[1], grid.φᵃᶜᵃ.offsets[1])
-        metric_size = (length(grid.Δλᶜᵃᵃ)   , length(grid.φᵃᶜᵃ))
+        metric_size = (length(grid.Δλᶜᵃᵃ), length(grid.φᵃᶜᵃ))
     end
 
-    for metric in grid_metrics
-        parentM        = Symbol(metric, :_parent)
-        @eval $parentM = zeros($FT, $metric_size...)
-        @eval $metric  = OffsetArray(on_architecture($arch, $parentM), $offsets...)
-    end
+    Δxᶠᶜ = OffsetArray(zeros(FT, arch, metric_size...), offsets...)
+    Δxᶜᶠ = OffsetArray(zeros(FT, arch, metric_size...), offsets...)
+    Δxᶠᶠ = OffsetArray(zeros(FT, arch, metric_size...), offsets...)
+    Δxᶜᶜ = OffsetArray(zeros(FT, arch, metric_size...), offsets...)
+    Azᶠᶜ = OffsetArray(zeros(FT, arch, metric_size...), offsets...)
+    Azᶜᶠ = OffsetArray(zeros(FT, arch, metric_size...), offsets...)
+    Azᶠᶠ = OffsetArray(zeros(FT, arch, metric_size...), offsets...)
+    Azᶜᶜ = OffsetArray(zeros(FT, arch, metric_size...), offsets...)
 
     if grid isa YRegularLLG
-        Δyᶠᶜ = FT(0)
-        Δyᶜᶠ = FT(0)
+        Δyᶠᶜ = Δyᶠᶜᵃ(1, 1, 1, grid)
+        Δyᶜᶠ = Δyᶜᶠᵃ(1, 1, 1, grid)
     else
         parentC = zeros(FT, length(grid.Δφᵃᶜᵃ))
         parentF = zeros(FT, length(grid.Δφᵃᶜᵃ))
@@ -582,12 +579,12 @@ coordinates(::LatitudeLongitudeGrid) = (:λᶠᵃᵃ, :λᶜᵃᵃ, :φᵃᶠᵃ
 ηname(::LLG) = :φ
 rname(::LLG) = :z
 
-@inline λnode(i, grid::LLG, ::Center) = @inbounds grid.λᶜᵃᵃ[i]
-@inline λnode(i, grid::LLG, ::Face)   = @inbounds grid.λᶠᵃᵃ[i]
-@inline φnode(j, grid::LLG, ::Center) = @inbounds grid.φᵃᶜᵃ[j]
-@inline φnode(j, grid::LLG, ::Face)   = @inbounds grid.φᵃᶠᵃ[j]
-@inline znode(k, grid::LLG, ::Center) = @inbounds grid.zᵃᵃᶜ[k]
-@inline znode(k, grid::LLG, ::Face)   = @inbounds grid.zᵃᵃᶠ[k]
+@inline λnode(i, grid::LLG, ::Center) = getnode(grid.λᶜᵃᵃ, i)
+@inline λnode(i, grid::LLG, ::Face)   = getnode(grid.λᶠᵃᵃ, i)
+@inline φnode(j, grid::LLG, ::Center) = getnode(grid.φᵃᶜᵃ, j)
+@inline φnode(j, grid::LLG, ::Face)   = getnode(grid.φᵃᶠᵃ, j)
+@inline znode(k, grid::LLG, ::Center) = getnode(grid.zᵃᵃᶜ, k)
+@inline znode(k, grid::LLG, ::Face)   = getnode(grid.zᵃᵃᶠ, k)
 
 # Definitions for node
 @inline ξnode(i, j, k, grid::LLG, ℓx, ℓy, ℓz) = λnode(i, grid, ℓx)
@@ -610,10 +607,23 @@ function nodes(grid::LLG, ℓx, ℓy, ℓz; reshape=false, with_halos=false)
     z = znodes(grid, ℓx, ℓy, ℓz; with_halos)
 
     if reshape
-        N = (length(λ), length(φ), length(z))
-        λ = Base.reshape(λ, N[1], 1, 1)
-        φ = Base.reshape(φ, 1, N[2], 1)
-        z = Base.reshape(z, 1, 1, N[3])
+        # Here we have to deal with the fact that Flat directions may have
+        # `nothing` nodes.
+        #
+        # A better solution (and more consistent with the rest of the API?)
+        # might be to omit the `nothing` nodes in the `reshape`. In other words,
+        # if `TX === Flat`, then we should return `(x, z)`. This is for future
+        # consideration...
+        #
+        # See also `nodes` for `RectilinearGrid`.
+        
+        Nλ = isnothing(λ) ? 1 : length(λ)
+        Nφ = isnothing(φ) ? 1 : length(φ)
+        Nz = isnothing(z) ? 1 : length(z)
+
+        λ = isnothing(λ) ? zeros(1, 1, 1) : Base.reshape(λ, Nλ, 1, 1)
+        φ = isnothing(φ) ? zeros(1, 1, 1) : Base.reshape(φ, 1, Nφ, 1)
+        z = isnothing(z) ? zeros(1, 1, 1) : Base.reshape(z, 1, 1, Nz)
     end
 
     return (λ, φ, z)
@@ -622,25 +632,21 @@ end
 const F = Face
 const C = Center
 
-@inline λnodes(grid::LLG, ℓx::F; with_halos=false) = with_halos ? grid.λᶠᵃᵃ :
-    view(grid.λᶠᵃᵃ, interior_indices(ℓx, topology(grid, 1)(), size(grid, 1)))
-@inline λnodes(grid::LLG, ℓx::C; with_halos=false) = with_halos ? grid.λᶜᵃᵃ :
-    view(grid.λᶜᵃᵃ, interior_indices(ℓx, topology(grid, 1)(), size(grid, 1)))
-
-@inline φnodes(grid::LLG, ℓy::F; with_halos=false) = with_halos ? grid.φᵃᶠᵃ :
-    view(grid.φᵃᶠᵃ, interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
-@inline φnodes(grid::LLG, ℓy::C; with_halos=false) = with_halos ? grid.φᵃᶜᵃ :
-    view(grid.φᵃᶜᵃ, interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
+@inline function xnodes(grid::LLG, ℓx, ℓy; with_halos=false)
+    # Adjoint the longitudes so they broadcast against the latitudes (presumably 1D vectors).
+    λ_row = λnodes(grid, ℓx; with_halos)'
+    φ_col = φnodes(grid, ℓy; with_halos)
+    return grid.radius .* deg2rad.(λ_row) .* hack_cosd.(φ_col)
+end
 
-@inline xnodes(grid::LLG, ℓx, ℓy; with_halos=false) =
-    grid.radius * deg2rad.(λnodes(grid, ℓx; with_halos=with_halos))' .* hack_cosd.(φnodes(grid, ℓy; with_halos=with_halos))
-@inline ynodes(grid::LLG, ℓy; with_halos=false)     =
-    grid.radius * deg2rad.(φnodes(grid, ℓy; with_halos=with_halos))
+@inline function ynodes(grid::LLG, ℓy; with_halos=false)
+    # Scale the φ nodes, converted from degrees to radians, by the grid radius.
+    latitudes = φnodes(grid, ℓy; with_halos)
+    return grid.radius .* deg2rad.(latitudes)
+end
 
-@inline znodes(grid::LLG, ℓz::F; with_halos=false) = with_halos ? grid.zᵃᵃᶠ :
-    view(grid.zᵃᵃᶠ, interior_indices(ℓz, topology(grid, 3)(), size(grid, 3)))
-@inline znodes(grid::LLG, ℓz::C; with_halos=false) = with_halos ? grid.zᵃᵃᶜ :
-    view(grid.zᵃᵃᶜ, interior_indices(ℓz, topology(grid, 3)(), size(grid, 3)))
+@inline znodes(grid::LLG, ℓz::F; with_halos=false) = _property(grid.zᵃᵃᶠ, ℓz, topology(grid, 3), size(grid, 3), with_halos)
+@inline znodes(grid::LLG, ℓz::C; with_halos=false) = _property(grid.zᵃᵃᶜ, ℓz, topology(grid, 3), size(grid, 3), with_halos)
 
 # Convenience
 @inline λnodes(grid::LLG, ℓx, ℓy, ℓz; with_halos=false) = λnodes(grid, ℓx; with_halos)
@@ -653,38 +659,45 @@ const C = Center
 ##### Grid spacings in x, y, z (in meters)
 #####
 
-@inline xspacings(grid::LLG, ℓx::C, ℓy::C; with_halos=false) = with_halos ? grid.Δxᶜᶜᵃ :
-    view(grid.Δxᶜᶜᵃ, interior_indices(ℓx, topology(grid, 1)(), grid.Nx), interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
-@inline xspacings(grid::LLG, ℓx::C, ℓy::F;   with_halos=false) = with_halos ? grid.Δxᶜᶠᵃ :
-    view(grid.Δxᶜᶠᵃ, interior_indices(ℓx, topology(grid, 1)(), grid.Nx), interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
-@inline xspacings(grid::LLG, ℓx::F, ℓy::C;   with_halos=false) = with_halos ? grid.Δxᶠᶜᵃ :
-    view(grid.Δxᶠᶜᵃ, interior_indices(ℓx, topology(grid, 1)(), grid.Nx), interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
-@inline xspacings(grid::LLG, ℓx::F, ℓy::F;     with_halos=false) = with_halos ? grid.Δxᶠᶠᵃ :
-    view(grid.Δxᶠᶠᵃ, interior_indices(ℓx, topology(grid, 1)(), grid.Nx), interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
-
-@inline xspacings(grid::HRegularLLG, ℓx::C, ℓy::C; with_halos=false) = with_halos ? grid.Δxᶜᶜᵃ :
-    view(grid.Δxᶜᶜᵃ, interior_indices(ℓy, topology(grid, 2)(), grid.Ny))
-@inline xspacings(grid::HRegularLLG, ℓx::C, ℓy::F;   with_halos=false) = with_halos ? grid.Δxᶜᶠᵃ :
-    view(grid.Δxᶜᶠᵃ, interior_indices(ℓy, topology(grid, 2)(), grid.Ny))
-@inline xspacings(grid::HRegularLLG, ℓx::F, ℓy::C;   with_halos=false) = with_halos ? grid.Δxᶠᶜᵃ :
-    view(grid.Δxᶠᶜᵃ, interior_indices(ℓy, topology(grid, 2)(), grid.Ny))
-@inline xspacings(grid::HRegularLLG, ℓx::F, ℓy::F;     with_halos=false) = with_halos ? grid.Δxᶠᶠᵃ :
-    view(grid.Δxᶠᶠᵃ, interior_indices(ℓy, topology(grid, 2)(), grid.Ny))
-
-@inline yspacings(grid::YNonRegularLLG, ℓx::C, ℓy::F;   with_halos=false) = with_halos ? grid.Δyᶜᶠᵃ :
-    view(grid.Δyᶜᶠᵃ, interior_indices(ℓy, topology(grid, 2)(), grid.Ny))
-@inline yspacings(grid::YNonRegularLLG, ℓx::F,   ℓy::C; with_halos=false) = with_halos ? grid.Δyᶠᶜᵃ :
-    view(grid.Δyᶠᶜᵃ, interior_indices(ℓy, topology(grid, 2)(), grid.Ny))
+@inline xspacings(grid::LLG, ℓx::C, ℓy::C; with_halos=false) = _property(grid.Δxᶜᶜᵃ, ℓx, ℓy,
+                                                                         topology(grid, 1), topology(grid, 2),
+                                                                         size(grid, 1), size(grid, 2), with_halos)
+
+@inline xspacings(grid::LLG, ℓx::C, ℓy::F; with_halos=false) = _property(grid.Δxᶜᶠᵃ, ℓx, ℓy,
+                                                                         topology(grid, 1), topology(grid, 2),
+                                                                         size(grid, 1), size(grid, 2), with_halos)
+
+@inline xspacings(grid::LLG, ℓx::F, ℓy::C; with_halos=false) = _property(grid.Δxᶠᶜᵃ, ℓx, ℓy,
+                                                                         topology(grid, 1), topology(grid, 2),
+                                                                         size(grid, 1), size(grid, 2), with_halos)
+ 
+@inline xspacings(grid::LLG, ℓx::F, ℓy::F; with_halos=false) = _property(grid.Δxᶠᶠᵃ, ℓx, ℓy,
+                                                                         topology(grid, 1), topology(grid, 2),
+                                                                         size(grid, 1), size(grid, 2), with_halos)
+ 
+@inline xspacings(grid::HRegularLLG, ℓx::C, ℓy::C; with_halos=false) = _property(grid.Δxᶜᶜᵃ, ℓy, topology(grid, 2),
+                                                                                 size(grid, 2), with_halos)
+
+@inline xspacings(grid::HRegularLLG, ℓx::C, ℓy::F; with_halos=false) = _property(grid.Δxᶜᶠᵃ, ℓy, topology(grid, 2),
+                                                                                 size(grid, 2), with_halos)
+
+@inline xspacings(grid::HRegularLLG, ℓx::F, ℓy::C; with_halos=false) = _property(grid.Δxᶠᶜᵃ, ℓy, topology(grid, 2),
+                                                                                 size(grid, 2), with_halos)
+
+@inline xspacings(grid::HRegularLLG, ℓx::F, ℓy::F; with_halos=false) = _property(grid.Δxᶠᶠᵃ, ℓy, topology(grid, 2),
+                                                                                 size(grid, 2), with_halos)
+
+
+@inline yspacings(grid::YNonRegularLLG, ℓx::C, ℓy::F; with_halos=false) = _property(grid.Δyᶜᶠᵃ, ℓy, topology(grid, 2),
+                                                                                    size(grid, 2), with_halos) # only ℓy and direction 2 are forwarded
+
+@inline yspacings(grid::YNonRegularLLG, ℓx::F, ℓy::C; with_halos=false) = _property(grid.Δyᶠᶜᵃ, ℓy, topology(grid, 2),
+                                                                                    size(grid, 2), with_halos) # only ℓy and direction 2 are forwarded
 
 @inline yspacings(grid::YRegularLLG, ℓx, ℓy; with_halos=false) = yspacings(grid, ℓy; with_halos)
 @inline yspacings(grid, ℓy::C; kwargs...) = grid.Δyᶠᶜᵃ
 @inline yspacings(grid, ℓy::F; kwargs...) = grid.Δyᶜᶠᵃ
 
-@inline zspacings(grid::LLG, ℓz::C; with_halos=false) = with_halos ? grid.Δzᵃᵃᶜ : view(grid.Δzᵃᵃᶜ, interior_indices(ℓz, topology(grid, 3)(), size(grid, 3)))
-@inline zspacings(grid::LLG, ℓz::F; with_halos=false) = with_halos ? grid.Δzᵃᵃᶠ : view(grid.Δzᵃᵃᶠ, interior_indices(ℓz, topology(grid, 3)(), size(grid, 3)))
-@inline zspacings(grid::ZRegularLLG, ℓz::C; with_halos=false) = grid.Δzᵃᵃᶜ
-@inline zspacings(grid::ZRegularLLG, ℓz::F; with_halos=false) = grid.Δzᵃᵃᶠ
-
 @inline xspacings(grid::LLG, ℓx, ℓy, ℓz; kwargs...) = xspacings(grid, ℓx, ℓy; kwargs...)
 @inline yspacings(grid::LLG, ℓx, ℓy, ℓz; kwargs...) = yspacings(grid, ℓx, ℓy; kwargs...)
 @inline zspacings(grid::LLG, ℓx, ℓy, ℓz; kwargs...) = zspacings(grid, ℓz; kwargs...)
@@ -693,18 +706,24 @@ const C = Center
 ##### Grid spacings in λ, φ (in degrees)
 #####
 
-@inline λspacings(grid::LLG, ℓx::C; with_halos=false) = with_halos ? grid.Δλᶜᵃᵃ : view(grid.Δλᶜᵃᵃ, interior_indices(ℓx, topology(grid, 1)(), grid.Nx))
-@inline λspacings(grid::LLG, ℓx::F; with_halos=false) = with_halos ? grid.Δλᶠᵃᵃ : view(grid.Δλᶠᵃᵃ, interior_indices(ℓx, topology(grid, 1)(), grid.Nx))
-@inline λspacings(grid::XRegularLLG, ℓx::C; with_halos=false) = grid.Δλᶜᵃᵃ
-@inline λspacings(grid::XRegularLLG, ℓx::F; with_halos=false) = grid.Δλᶠᵃᵃ
+@inline λnodes(grid::LLG, ℓx::F; with_halos=false) = _property(grid.λᶠᵃᵃ, ℓx, topology(grid, 1), size(grid, 1), with_halos)
+@inline λnodes(grid::LLG, ℓx::C; with_halos=false) = _property(grid.λᶜᵃᵃ, ℓx, topology(grid, 1), size(grid, 1), with_halos)
+@inline φnodes(grid::LLG, ℓy::F; with_halos=false) = _property(grid.φᵃᶠᵃ, ℓy, topology(grid, 2), size(grid, 2), with_halos)
+@inline φnodes(grid::LLG, ℓy::C; with_halos=false) = _property(grid.φᵃᶜᵃ, ℓy, topology(grid, 2), size(grid, 2), with_halos)
+
+#####
+##### Grid spacings
+#####
 
-@inline φspacings(grid::LLG, ℓy::C; with_halos=false) = with_halos ? grid.Δφᵃᶜᵃ : view(grid.Δφᵃᶜᵃ, interior_indices(ℓy, topology(grid, 2)(), grid.Ny))
-@inline φspacings(grid::LLG, ℓy::F; with_halos=false) = with_halos ? grid.Δφᵃᶠᵃ : view(grid.Δφᵃᶠᵃ, interior_indices(ℓy, topology(grid, 2)(), grid.Ny))
-@inline φspacings(grid::YRegularLLG, ℓy::C; with_halos=false) = grid.Δφᵃᶜᵃ
-@inline φspacings(grid::YRegularLLG, ℓy::F; with_halos=false) = grid.Δφᵃᶠᵃ
+@inline λspacings(grid::LLG, ℓx::C; with_halos=false) = _property(grid.Δλᶜᵃᵃ, ℓx, topology(grid, 1), size(grid, 1), with_halos)
+@inline λspacings(grid::LLG, ℓx::F; with_halos=false) = _property(grid.Δλᶠᵃᵃ, ℓx, topology(grid, 1), size(grid, 1), with_halos)
+@inline φspacings(grid::LLG, ℓy::C; with_halos=false) = _property(grid.Δφᵃᶜᵃ, ℓy, topology(grid, 2), size(grid, 2), with_halos)
+@inline φspacings(grid::LLG, ℓy::F; with_halos=false) = _property(grid.Δφᵃᶠᵃ, ℓy, topology(grid, 2), size(grid, 2), with_halos)
+@inline zspacings(grid::LLG, ℓz::C; with_halos=false) = _property(grid.Δzᵃᵃᶜ, ℓz, topology(grid, 3), size(grid, 3), with_halos)
+@inline zspacings(grid::LLG, ℓz::F; with_halos=false) = _property(grid.Δzᵃᵃᶠ, ℓz, topology(grid, 3), size(grid, 3), with_halos)
 
-@inline λspacings(grid::LLG, ℓx, ℓy, ℓz; with_halos=false) = λspacings(grid, ℓx; with_halos)
-@inline φspacings(grid::LLG, ℓx, ℓy, ℓz; with_halos=false) = φspacings(grid, ℓy; with_halos)
+@inline λspacings(grid::LLG, ℓx, ℓy, ℓz; kwargs...) = λspacings(grid, ℓx; kwargs...)
+@inline φspacings(grid::LLG, ℓx, ℓy, ℓz; kwargs...) = φspacings(grid, ℓy; kwargs...)
 
 @inline λspacing(i, grid::LLG, ::C) = @inbounds grid.Δλᶜᵃᵃ[i]
 @inline λspacing(i, grid::LLG, ::F) = @inbounds grid.Δλᶠᵃᵃ[i]
@@ -718,3 +737,4 @@ const C = Center
 
 @inline λspacing(i, j, k, grid::LLG, ℓx, ℓy, ℓz) = λspacing(i, grid, ℓx)
 @inline φspacing(i, j, k, grid::LLG, ℓx, ℓy, ℓz) = φspacing(j, grid, ℓy)
+
diff --git a/src/Grids/new_data.jl b/src/Grids/new_data.jl
index 95e713ea15..7b2bf1aac9 100644
--- a/src/Grids/new_data.jl
+++ b/src/Grids/new_data.jl
@@ -43,6 +43,7 @@ function offset_data(underlying_data::A, loc, topo, N, H, indices::T=default_ind
         Base.@_inline_meta
         axes(underlying_data, i+length(ii))
     end
+
     return OffsetArray(underlying_data, ii..., extra_ii...)
 end
 
diff --git a/src/Grids/nodes_and_spacings.jl b/src/Grids/nodes_and_spacings.jl
index 858edd0b5c..e348c0cb47 100644
--- a/src/Grids/nodes_and_spacings.jl
+++ b/src/Grids/nodes_and_spacings.jl
@@ -5,6 +5,10 @@
 ##### the first, second, and third coordinates respectively.
 #####
 
+@inline getnode(ξ, i) = @inbounds ξ[i]  # generic case: index into ξ with bounds checks elided
+@inline getnode(::Nothing, i) = nothing  # no nodes stored: return `nothing` for any index
+@inline getnode(ξ::Number, i) = ξ  # a single number: returned unchanged for any index
+
 node_names(grid, ℓx, ℓy, ℓz) = _node_names(grid, ℓx, ℓy, ℓz)
 
 node_names(grid::XFlatGrid, ℓx, ℓy, ℓz)   = _node_names(grid, nothing, ℓy, ℓz)
@@ -195,7 +199,6 @@ julia> xspacings(grid, Center(), Face(), Center())
 """
 @inline xspacings(grid, ℓx, ℓy, ℓz; with_halos=true) = xspacings(grid, ℓx; with_halos)
 
-
 """
     yspacings(grid, ℓx, ℓy, ℓz; with_halos=true)
 
@@ -233,8 +236,12 @@ julia> zspacings(grid, Center(), Center(), Center())
 destantiate(::Face)   = Face
 destantiate(::Center) = Center
 
-function minimum_spacing(dir, grid, ℓx, ℓy, ℓz)
-    spacing = eval(Symbol(dir, :spacing))
+spacing_function(::Val{:x}) = xspacing  # chosen by dispatch on the `Val` type
+spacing_function(::Val{:y}) = yspacing  # `Val(:y)` selects `yspacing`
+spacing_function(::Val{:z}) = zspacing  # `Val(:z)` selects `zspacing`
+
+function minimum_spacing(s, grid, ℓx, ℓy, ℓz)
+    spacing = spacing_function(s)
     LX, LY, LZ = map(destantiate, (ℓx, ℓy, ℓz))
     Δ = KernelFunctionOperation{LX, LY, LZ}(spacing, grid, ℓx, ℓy, ℓz)
 
@@ -258,8 +265,8 @@ julia> minimum_xspacing(grid, Center(), Center(), Center())
 0.5
 ```
 """
-minimum_xspacing(grid, ℓx, ℓy, ℓz) = minimum_spacing(:x, grid, ℓx, ℓy, ℓz)
-minimum_xspacing(grid) = minimum_spacing(:x, grid, Center(), Center(), Center())
+minimum_xspacing(grid, ℓx, ℓy, ℓz) = minimum_spacing(Val(:x), grid, ℓx, ℓy, ℓz)
+minimum_xspacing(grid) = minimum_spacing(Val(:x), grid, Center(), Center(), Center())
 """
     minimum_yspacing(grid, ℓx, ℓy, ℓz)
     minimum_yspacing(grid) = minimum_yspacing(grid, Center(), Center(), Center())
@@ -277,8 +284,8 @@ julia> minimum_yspacing(grid, Center(), Center(), Center())
 0.25
 ```
 """
-minimum_yspacing(grid, ℓx, ℓy, ℓz) = minimum_spacing(:y, grid, ℓx, ℓy, ℓz)
-minimum_yspacing(grid) = minimum_spacing(:y, grid, Center(), Center(), Center())
+minimum_yspacing(grid, ℓx, ℓy, ℓz) = minimum_spacing(Val(:y), grid, ℓx, ℓy, ℓz)
+minimum_yspacing(grid) = minimum_spacing(Val(:y), grid, Center(), Center(), Center())
 
 """
     minimum_zspacing(grid, ℓx, ℓy, ℓz)
@@ -297,7 +304,6 @@ julia> minimum_zspacing(grid, Center(), Center(), Center())
 0.125
 ```
 """
-minimum_zspacing(grid, ℓx, ℓy, ℓz) = minimum_spacing(:z, grid, ℓx, ℓy, ℓz)
-minimum_zspacing(grid) = minimum_spacing(:z, grid, Center(), Center(), Center())
-
+minimum_zspacing(grid, ℓx, ℓy, ℓz) = minimum_spacing(Val(:z), grid, ℓx, ℓy, ℓz)
+minimum_zspacing(grid) = minimum_spacing(Val(:z), grid, Center(), Center(), Center())
 
diff --git a/src/Grids/orthogonal_spherical_shell_grid.jl b/src/Grids/orthogonal_spherical_shell_grid.jl
index 6d696b20ba..627f930db4 100644
--- a/src/Grids/orthogonal_spherical_shell_grid.jl
+++ b/src/Grids/orthogonal_spherical_shell_grid.jl
@@ -167,7 +167,9 @@ function conformal_cubed_sphere_panel(architecture::AbstractArchitecture = CPU()
                                       η = (-1, 1),
                                       radius = R_Earth,
                                       halo = (1, 1, 1),
-                                      rotation = nothing)
+                                      rotation = nothing,
+                                      non_uniform_conformal_mapping = false,
+                                      spacing_type = "geometric")
 
     if architecture == GPU() && !has_cuda() 
         throw(ArgumentError("Cannot create a GPU grid. No CUDA-enabled GPU was detected!"))
@@ -189,10 +191,16 @@ function conformal_cubed_sphere_panel(architecture::AbstractArchitecture = CPU()
                               topology = ξη_grid_topology,
                               x=ξ, y=η, z, halo)
 
-    ξᶠᵃᵃ = xnodes(ξη_grid, Face())
-    ξᶜᵃᵃ = xnodes(ξη_grid, Center())
-    ηᵃᶠᵃ = ynodes(ξη_grid, Face())
-    ηᵃᶜᵃ = ynodes(ξη_grid, Center())
+    if non_uniform_conformal_mapping
+        ξᶠᵃᵃ, ηᵃᶠᵃ, xᶠᶠᵃ, yᶠᶠᵃ, z = optimized_non_uniform_conformal_cubed_sphere_coordinates(Nξ+1, Nη+1, spacing_type)
+        ξᶜᵃᵃ = [0.5 * (ξᶠᵃᵃ[i] + ξᶠᵃᵃ[i+1]) for i in 1:Nξ]
+        ηᵃᶜᵃ = [0.5 * (ηᵃᶠᵃ[j] + ηᵃᶠᵃ[j+1]) for j in 1:Nη]
+    else
+        ξᶠᵃᵃ = xnodes(ξη_grid, Face())
+        ξᶜᵃᵃ = xnodes(ξη_grid, Center())
+        ηᵃᶠᵃ = ynodes(ξη_grid, Face())
+        ηᵃᶜᵃ = ynodes(ξη_grid, Center())
+    end
 
     ## The vertical coordinates and metrics can come out of the regular rectilinear grid!
      zᵃᵃᶠ = ξη_grid.zᵃᵃᶠ
@@ -970,7 +978,7 @@ function Adapt.adapt_structure(to, grid::OrthogonalSphericalShellGrid)
                                                     adapt(to, grid.Azᶜᶠᵃ),
                                                     adapt(to, grid.Azᶠᶠᵃ),
                                                     grid.radius,
-                                                    grid.conformal_mapping)
+                                                    adapt(to, grid.conformal_mapping))
 end
 
 function Base.summary(grid::OrthogonalSphericalShellGrid)
@@ -1038,7 +1046,7 @@ function Base.show(io::IO, grid::OrthogonalSphericalShellGrid, withsummary=true)
 
     λ₁, λ₂ = minimum(grid.λᶠᶠᵃ[1:Nx_face, 1:Ny_face]), maximum(grid.λᶠᶠᵃ[1:Nx_face, 1:Ny_face])
     φ₁, φ₂ = minimum(grid.φᶠᶠᵃ[1:Nx_face, 1:Ny_face]), maximum(grid.φᶠᶠᵃ[1:Nx_face, 1:Ny_face])
-    z₁, z₂ = domain(topology(grid, 3)(), Nz, grid.zᵃᵃᶠ)
+    Ωz = domain(topology(grid, 3)(), Nz, grid.zᵃᵃᶠ)
 
     (λ_center, φ_center), (extent_λ, extent_φ) = get_center_and_extents_of_shell(grid)
 
@@ -1059,17 +1067,21 @@ function Base.show(io::IO, grid::OrthogonalSphericalShellGrid, withsummary=true)
     end
 
     λ_summary = "$(TX)  extent $(prettysummary(extent_λ)) degrees"
-    φ_summary = "$(TX)  extent $(prettysummary(extent_φ)) degrees"
-    z_summary = domain_summary(TZ(), "z", z₁, z₂)
+    φ_summary = "$(TY)  extent $(prettysummary(extent_φ)) degrees"
+    z_summary = domain_summary(TZ(), "z", Ωz)
 
     longest = max(length(λ_summary), length(φ_summary), length(z_summary))
 
     padding_λ = length(λ_summary) < longest ? " "^(longest - length(λ_summary)) : ""
     padding_φ = length(φ_summary) < longest ? " "^(longest - length(φ_summary)) : ""
 
-    λ_summary = "longitude: $(TX)  extent $(prettysummary(extent_λ)) degrees" * padding_λ *" " * coordinate_summary(rad2deg.(grid.Δxᶠᶠᵃ[1:Nx_face, 1:Ny_face] ./ grid.radius), "λ")
-    φ_summary = "latitude:  $(TX)  extent $(prettysummary(extent_φ)) degrees" * padding_φ *" " * coordinate_summary(rad2deg.(grid.Δyᶠᶠᵃ[1:Nx_face, 1:Ny_face] ./ grid.radius), "φ")
-    z_summary = "z:         " * dimension_summary(TZ(), "z", z₁, z₂, grid.Δzᵃᵃᶜ, longest - length(z_summary))
+    λ_summary = "longitude: $(TX)  extent $(prettysummary(extent_λ)) degrees" * padding_λ * " " *
+                coordinate_summary(TX, rad2deg.(grid.Δxᶠᶠᵃ[1:Nx_face, 1:Ny_face] ./ grid.radius), "λ")
+
+    φ_summary = "latitude:  $(TY)  extent $(prettysummary(extent_φ)) degrees" * padding_φ * " " *
+                coordinate_summary(TY, rad2deg.(grid.Δyᶠᶠᵃ[1:Nx_face, 1:Ny_face] ./ grid.radius), "φ")
+
+    z_summary = "z:         " * dimension_summary(TZ(), "z", Ωz, grid.Δzᵃᵃᶜ, longest - length(z_summary))
 
     if withsummary
         print(io, summary(grid), "\n")
diff --git a/src/Grids/rectilinear_grid.jl b/src/Grids/rectilinear_grid.jl
index 56cb892d4e..57313d87a4 100644
--- a/src/Grids/rectilinear_grid.jl
+++ b/src/Grids/rectilinear_grid.jl
@@ -66,7 +66,6 @@ stretched_dimensions(::YZRegularRG) = tuple(1)
 stretched_dimensions(::XZRegularRG) = tuple(2)
 stretched_dimensions(::XYRegularRG) = tuple(3)
 
-
 """
     RectilinearGrid([architecture = CPU(), FT = Float64];
                     size,
@@ -200,7 +199,7 @@ julia> grid = RectilinearGrid(size=256, z=(-128, 0), topology=(Flat, Flat, Bound
 1×1×256 RectilinearGrid{Float64, Flat, Flat, Bounded} on CPU with 0×0×3 halo
 ├── Flat x
 ├── Flat y
-└── Bounded  z ∈ [-128.0, 0.0]    regularly spaced with Δz=0.5
+└── Bounded  z ∈ [-128.0, 0.0] regularly spaced with Δz=0.5
 ```
 
 * A horizontally-periodic regular grid with cell interfaces stretched hyperbolically near the top:
@@ -275,11 +274,11 @@ function RectilinearGrid(architecture::AbstractArchitecture = CPU(),
     Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, TX(), Nx, Hx, x, :x, architecture)
     Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, TY(), Ny, Hy, y, :y, architecture)
     Lz, zᵃᵃᶠ, zᵃᵃᶜ, Δzᵃᵃᶠ, Δzᵃᵃᶜ = generate_coordinate(FT, TZ(), Nz, Hz, z, :z, architecture)
- 
+
     return RectilinearGrid{TX, TY, TZ}(architecture,
                                        Nx, Ny, Nz,
                                        Hx, Hy, Hz,
-                                       FT(Lx), FT(Ly), FT(Lz),
+                                       Lx, Ly, Lz,
                                        Δxᶠᵃᵃ, Δxᶜᵃᵃ, xᶠᵃᵃ, xᶜᵃᵃ,
                                        Δyᵃᶠᵃ, Δyᵃᶜᵃ, yᵃᶠᵃ, yᵃᶜᵃ,
                                        Δzᵃᵃᶠ, Δzᵃᵃᶜ, zᵃᵃᶠ, zᵃᵃᶜ)
@@ -321,19 +320,19 @@ end
 function Base.show(io::IO, grid::RectilinearGrid, withsummary=true)
     TX, TY, TZ = topology(grid)
 
-    x₁, x₂ = domain(TX(), grid.Nx, grid.xᶠᵃᵃ)
-    y₁, y₂ = domain(TY(), grid.Ny, grid.yᵃᶠᵃ)
-    z₁, z₂ = domain(TZ(), grid.Nz, grid.zᵃᵃᶠ)
+    Ωx = domain(TX(), grid.Nx, grid.xᶠᵃᵃ)
+    Ωy = domain(TY(), grid.Ny, grid.yᵃᶠᵃ)
+    Ωz = domain(TZ(), grid.Nz, grid.zᵃᵃᶠ)
 
-    x_summary = domain_summary(TX(), "x", x₁, x₂)
-    y_summary = domain_summary(TY(), "y", y₁, y₂)
-    z_summary = domain_summary(TZ(), "z", z₁, z₂)
+    x_summary = domain_summary(TX(), "x", Ωx)
+    y_summary = domain_summary(TY(), "y", Ωy)
+    z_summary = domain_summary(TZ(), "z", Ωz)
 
     longest = max(length(x_summary), length(y_summary), length(z_summary))
 
-    x_summary = dimension_summary(TX(), "x", x₁, x₂, grid.Δxᶜᵃᵃ, longest - length(x_summary))
-    y_summary = dimension_summary(TY(), "y", y₁, y₂, grid.Δyᵃᶜᵃ, longest - length(y_summary))
-    z_summary = dimension_summary(TZ(), "z", z₁, z₂, grid.Δzᵃᵃᶜ, longest - length(z_summary))
+    x_summary = dimension_summary(TX(), "x", Ωx, grid.Δxᶜᵃᵃ, longest - length(x_summary))
+    y_summary = dimension_summary(TY(), "y", Ωy, grid.Δyᵃᶜᵃ, longest - length(y_summary))
+    z_summary = dimension_summary(TZ(), "z", Ωz, grid.Δzᵃᵃᶜ, longest - length(z_summary))
 
     if withsummary
         print(io, summary(grid), "\n")
@@ -372,42 +371,73 @@ cpu_face_constructor_x(grid::XRegularRG) = x_domain(grid)
 cpu_face_constructor_y(grid::YRegularRG) = y_domain(grid)
 cpu_face_constructor_z(grid::ZRegularRG) = z_domain(grid)
 
-function with_halo(new_halo, old_grid::RectilinearGrid)
+function constructor_arguments(grid::RectilinearGrid)
+    # Collect the inputs used to call `RectilinearGrid(arch, FT; kwargs...)` for `grid`.
+    arch = architecture(grid)
+    FT = eltype(grid)
+    args = Dict(:architecture => arch, :number_type => FT)
+
+    # `size` and `halo` go through `pop_flat_elements` (Flat entries removed for the constructor).
+    topo = topology(grid)
+    N = pop_flat_elements((grid.Nx, grid.Ny, grid.Nz), topo)
+    H = pop_flat_elements((grid.Hx, grid.Hy, grid.Hz), topo)
+
+    kwargs = Dict(:size => N,
+                  :halo => H,
+                  :x => cpu_face_constructor_x(grid),
+                  :y => cpu_face_constructor_y(grid),
+                  :z => cpu_face_constructor_z(grid),
+                  :topology => topo)
+
+    # Both dictionaries are keyed by `Symbol`, so `kwargs` can be splatted as keywords.
+    return args, kwargs
+end
 
-    size = (old_grid.Nx, old_grid.Ny, old_grid.Nz)
-    topo = topology(old_grid)
+function Base.similar(grid::RectilinearGrid)
+    # Call the constructor again with the architecture, number type and keywords taken from `grid`.
+    positional, keywords = constructor_arguments(grid)
+    device_arch = positional[:architecture]
+    return RectilinearGrid(device_arch, positional[:number_type]; keywords...)
+end
 
-    x = cpu_face_constructor_x(old_grid)
-    y = cpu_face_constructor_y(old_grid)
-    z = cpu_face_constructor_z(old_grid)
+"""
+    with_number_type(number_type, grid)
 
-    # Remove elements of size and new_halo in Flat directions as expected by grid
-    # constructor
-    size     = pop_flat_elements(size, topo)
-    new_halo = pop_flat_elements(new_halo, topo)
+Return a `new_grid` that's identical to `grid` but with `number_type`.
+"""
+function with_number_type(FT, grid::RectilinearGrid)
+    # Architecture and keywords come from `grid`; only the number type is replaced by `FT`.
+    positional, keywords = constructor_arguments(grid)
+    return RectilinearGrid(positional[:architecture], FT; keywords...)
+end
 
-    new_grid = RectilinearGrid(architecture(old_grid), eltype(old_grid);
-                               size, x, y, z,
-                               topology = topo,
-                               halo = new_halo)
+"""
+    with_halo(halo, grid)
 
-    return new_grid
+Return a `new_grid` that's identical to `grid` but with `halo`.
+"""
+function with_halo(halo, grid::RectilinearGrid)
+    positional, keywords = constructor_arguments(grid)
+    # Pass the requested halo through `pop_flat_elements` before overriding the stored one.
+    keywords[:halo] = pop_flat_elements(halo, topology(grid))
+    device_arch = positional[:architecture]
+    float_type = positional[:number_type]
+    return RectilinearGrid(device_arch, float_type; keywords...)
 end
 
-function on_architecture(new_arch::AbstractSerialArchitecture, old_grid::RectilinearGrid)
-    old_properties = (old_grid.Δxᶠᵃᵃ, old_grid.Δxᶜᵃᵃ, old_grid.xᶠᵃᵃ, old_grid.xᶜᵃᵃ,
-                      old_grid.Δyᵃᶠᵃ, old_grid.Δyᵃᶜᵃ, old_grid.yᵃᶠᵃ, old_grid.yᵃᶜᵃ,
-                      old_grid.Δzᵃᵃᶠ, old_grid.Δzᵃᵃᶜ, old_grid.zᵃᵃᶠ, old_grid.zᵃᵃᶜ)
-
-    new_properties = Tuple(on_architecture(new_arch, p) for p in old_properties)
+"""
+    on_architecture(architecture, grid)
 
-    TX, TY, TZ = topology(old_grid)
+Return a `new_grid` that's identical to `grid` but on `architecture`.
+"""
+function on_architecture(arch::AbstractSerialArchitecture, grid::RectilinearGrid)
+    if arch == architecture(grid)  # already on the requested architecture
+        return grid                # hand back the very same grid, nothing is rebuilt
+    end
 
-    return RectilinearGrid{TX, TY, TZ}(new_arch,
-                                       old_grid.Nx, old_grid.Ny, old_grid.Nz,
-                                       old_grid.Hx, old_grid.Hy, old_grid.Hz,
-                                       old_grid.Lx, old_grid.Ly, old_grid.Lz,
-                                       new_properties...)
+    args, kwargs = constructor_arguments(grid)  # positional and keyword inputs for the call below
+    FT = args[:number_type]                     # keep the number type stored for `grid`
+    return RectilinearGrid(arch, FT; kwargs...)  # build a new grid on `arch`
 end
 
 coordinates(::RectilinearGrid) = (:xᶠᵃᵃ, :xᶜᵃᵃ, :yᵃᶠᵃ, :yᵃᶜᵃ, :zᵃᵃᶠ, :zᵃᵃᶜ)
@@ -420,12 +450,12 @@ coordinates(::RectilinearGrid) = (:xᶠᵃᵃ, :xᶜᵃᵃ, :yᵃᶠᵃ, :yᵃ
 ηname(::RG) = :y
 rname(::RG) = :z
 
-@inline xnode(i, grid::RG, ::Center) = @inbounds grid.xᶜᵃᵃ[i]
-@inline xnode(i, grid::RG, ::Face)   = @inbounds grid.xᶠᵃᵃ[i]
-@inline ynode(j, grid::RG, ::Center) = @inbounds grid.yᵃᶜᵃ[j]
-@inline ynode(j, grid::RG, ::Face)   = @inbounds grid.yᵃᶠᵃ[j]
-@inline znode(k, grid::RG, ::Center) = @inbounds grid.zᵃᵃᶜ[k]
-@inline znode(k, grid::RG, ::Face)   = @inbounds grid.zᵃᵃᶠ[k]
+@inline xnode(i, grid::RG, ::Center) = getnode(grid.xᶜᵃᵃ, i)
+@inline xnode(i, grid::RG, ::Face)   = getnode(grid.xᶠᵃᵃ, i)
+@inline ynode(j, grid::RG, ::Center) = getnode(grid.yᵃᶜᵃ, j)
+@inline ynode(j, grid::RG, ::Face)   = getnode(grid.yᵃᶠᵃ, j)
+@inline znode(k, grid::RG, ::Center) = getnode(grid.zᵃᵃᶜ, k)
+@inline znode(k, grid::RG, ::Face)   = getnode(grid.zᵃᵃᶠ, k)
 
 @inline ξnode(i, j, k, grid::RG, ℓx, ℓy, ℓz) = xnode(i, grid, ℓx)
 @inline ηnode(i, j, k, grid::RG, ℓx, ℓy, ℓz) = ynode(j, grid, ℓy)
@@ -442,10 +472,23 @@ function nodes(grid::RectilinearGrid, ℓx, ℓy, ℓz; reshape=false, with_halo
     z = znodes(grid, ℓx, ℓy, ℓz; with_halos)
 
     if reshape
-        N = (length(x), length(y), length(z))
-        x = Base.reshape(x, N[1], 1, 1)
-        y = Base.reshape(y, 1, N[2], 1)
-        z = Base.reshape(z, 1, 1, N[3])
+        # Here we have to deal with the fact that Flat directions may have
+        # `nothing` nodes.
+        #
+        # A better solution (and more consistent with the rest of the API?)
+        # might be to omit the `nothing` nodes in the `reshape`. In other words,
+        # if `TX === Flat`, then we should return `(x, z)`. This is for future
+        # consideration...
+        # 
+        # See also `nodes` for `LatitudeLongitudeGrid`.
+
+        Nx = isnothing(x) ? 1 : length(x)
+        Ny = isnothing(y) ? 1 : length(y)
+        Nz = isnothing(z) ? 1 : length(z)
+
+        x = isnothing(x) ? zeros(1, 1, 1) : Base.reshape(x, Nx, 1, 1)
+        y = isnothing(y) ? zeros(1, 1, 1) : Base.reshape(y, 1, Ny, 1)
+        z = isnothing(z) ? zeros(1, 1, 1) : Base.reshape(z, 1, 1, Nz)
     end
 
     return (x, y, z)
@@ -454,14 +497,12 @@ end
 const F = Face
 const C = Center
 
-@inline xnodes(grid::RG, ℓx::F; with_halos=false) = with_halos ? grid.xᶠᵃᵃ : view(grid.xᶠᵃᵃ, interior_indices(ℓx, topology(grid, 1)(), size(grid, 1)))
-@inline xnodes(grid::RG, ℓx::C; with_halos=false) = with_halos ? grid.xᶜᵃᵃ : view(grid.xᶜᵃᵃ, interior_indices(ℓx, topology(grid, 1)(), size(grid, 1)))
-
-@inline ynodes(grid::RG, ℓy::F; with_halos=false) = with_halos ? grid.yᵃᶠᵃ : view(grid.yᵃᶠᵃ, interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
-@inline ynodes(grid::RG, ℓy::C; with_halos=false) = with_halos ? grid.yᵃᶜᵃ : view(grid.yᵃᶜᵃ, interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
-
-@inline znodes(grid::RG, ℓz::F; with_halos=false) = with_halos ? grid.zᵃᵃᶠ : view(grid.zᵃᵃᶠ, interior_indices(ℓz, topology(grid, 3)(), size(grid, 3)))
-@inline znodes(grid::RG, ℓz::C; with_halos=false) = with_halos ? grid.zᵃᵃᶜ : view(grid.zᵃᵃᶜ, interior_indices(ℓz, topology(grid, 3)(), size(grid, 3)))
+@inline xnodes(grid::RG, ℓx::F; with_halos=false) = _property(grid.xᶠᵃᵃ, ℓx, topology(grid, 1), size(grid, 1), with_halos)
+@inline xnodes(grid::RG, ℓx::C; with_halos=false) = _property(grid.xᶜᵃᵃ, ℓx, topology(grid, 1), size(grid, 1), with_halos)
+@inline ynodes(grid::RG, ℓy::F; with_halos=false) = _property(grid.yᵃᶠᵃ, ℓy, topology(grid, 2), size(grid, 2), with_halos)
+@inline ynodes(grid::RG, ℓy::C; with_halos=false) = _property(grid.yᵃᶜᵃ, ℓy, topology(grid, 2), size(grid, 2), with_halos)
+@inline znodes(grid::RG, ℓz::F; with_halos=false) = _property(grid.zᵃᵃᶠ, ℓz, topology(grid, 3), size(grid, 3), with_halos)
+@inline znodes(grid::RG, ℓz::C; with_halos=false) = _property(grid.zᵃᵃᶜ, ℓz, topology(grid, 3), size(grid, 3), with_halos)
 
 # convenience
 @inline xnodes(grid::RG, ℓx, ℓy, ℓz; with_halos=false) = xnodes(grid, ℓx; with_halos)
@@ -472,23 +513,16 @@ const C = Center
 ##### Grid spacings
 #####
 
-@inline xspacings(grid::RG,         ℓx::C; with_halos=false) = with_halos ? grid.Δxᶜᵃᵃ : view(grid.Δxᶜᵃᵃ, interior_indices(ℓx, topology(grid, 1)(), size(grid, 1)))
-@inline xspacings(grid::XRegularRG, ℓx::C; with_halos=false) = grid.Δxᶜᵃᵃ
-@inline xspacings(grid::RG,         ℓx::F; with_halos=false) = with_halos ? grid.Δxᶠᵃᵃ : view(grid.Δxᶠᵃᵃ, interior_indices(ℓx, topology(grid, 1)(), size(grid, 1)))
-@inline xspacings(grid::XRegularRG, ℓx::F; with_halos=false) = grid.Δxᶠᵃᵃ
-
-@inline yspacings(grid::RG,         ℓy::C; with_halos=false) = with_halos ? grid.Δyᵃᶜᵃ : view(grid.Δyᵃᶜᵃ, interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
-@inline yspacings(grid::YRegularRG, ℓy::C; with_halos=false) = grid.Δyᵃᶜᵃ
-@inline yspacings(grid::RG,         ℓy::F; with_halos=false) = with_halos ? grid.Δyᵃᶠᵃ : view(grid.Δyᵃᶠᵃ, interior_indices(ℓy, topology(grid, 2)(), size(grid, 2)))
-@inline yspacings(grid::YRegularRG, ℓy::F; with_halos=false) = grid.Δyᵃᶠᵃ
-
-@inline zspacings(grid::RG,         ℓz::C; with_halos=false) = with_halos ? grid.Δzᵃᵃᶜ : view(grid.Δzᵃᵃᶜ, interior_indices(ℓz, topology(grid, 3)(), size(grid, 3)))
-@inline zspacings(grid::ZRegularRG, ℓz::C; with_halos=false) = grid.Δzᵃᵃᶜ
-@inline zspacings(grid::RG,         ℓz::F; with_halos=false) = with_halos ? grid.Δzᵃᵃᶠ : view(grid.Δzᵃᵃᶠ, interior_indices(ℓz, topology(grid, 3)(), size(grid, 3)))
-@inline zspacings(grid::ZRegularRG, ℓz::F; with_halos=false) = grid.Δzᵃᵃᶠ
+@inline xspacings(grid::RG, ℓx::C; with_halos=false) = _property(grid.Δxᶜᵃᵃ, ℓx, topology(grid, 1), size(grid, 1), with_halos)
+@inline xspacings(grid::RG, ℓx::F; with_halos=false) = _property(grid.Δxᶠᵃᵃ, ℓx, topology(grid, 1), size(grid, 1), with_halos)
+@inline yspacings(grid::RG, ℓy::C; with_halos=false) = _property(grid.Δyᵃᶜᵃ, ℓy, topology(grid, 2), size(grid, 2), with_halos)
+@inline yspacings(grid::RG, ℓy::F; with_halos=false) = _property(grid.Δyᵃᶠᵃ, ℓy, topology(grid, 2), size(grid, 2), with_halos)
+@inline zspacings(grid::RG, ℓz::C; with_halos=false) = _property(grid.Δzᵃᵃᶜ, ℓz, topology(grid, 3), size(grid, 3), with_halos)
+@inline zspacings(grid::RG, ℓz::F; with_halos=false) = _property(grid.Δzᵃᵃᶠ, ℓz, topology(grid, 3), size(grid, 3), with_halos)
 
 @inline xspacings(grid::RG, ℓx, ℓy, ℓz; kwargs...) = xspacings(grid, ℓx; kwargs...)
 @inline yspacings(grid::RG, ℓx, ℓy, ℓz; kwargs...) = yspacings(grid, ℓy; kwargs...)
 @inline zspacings(grid::RG, ℓx, ℓy, ℓz; kwargs...) = zspacings(grid, ℓz; kwargs...)
 
 @inline isrectilinear(::RG) = true
+
diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 1270592abf..a4de847eea 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -50,6 +50,7 @@ import Oceananigans.Grids:  cpu_face_constructor_x, cpu_face_constructor_y, cpu_
 import Oceananigans.Grids: architecture, on_architecture, with_halo, inflate_halo_size_one_dimension,
                            xnode, ynode, znode, λnode, φnode, node,
                            ξnode, ηnode, rnode,
+                           ξname, ηname, rname, node_names,
                            xnodes, ynodes, znodes, λnodes, φnodes, nodes,
                            inactive_cell
 
@@ -135,7 +136,7 @@ const IBG = ImmersedBoundaryGrid
 @inline get_ibg_property(ibg::IBG, ::Val{:immersed_boundary})      = getfield(ibg, :immersed_boundary)
 @inline get_ibg_property(ibg::IBG, ::Val{:underlying_grid})        = getfield(ibg, :underlying_grid)
 @inline get_ibg_property(ibg::IBG, ::Val{:interior_active_cells})  = getfield(ibg, :interior_active_cells)
-@inline get_ibg_property(ibg::IBG, ::Val{:active_z_columns})   = getfield(ibg, :active_z_columns)
+@inline get_ibg_property(ibg::IBG, ::Val{:active_z_columns})       = getfield(ibg, :active_z_columns)
 
 @inline architecture(ibg::IBG) = architecture(ibg.underlying_grid)
 
@@ -146,8 +147,8 @@ const IBG = ImmersedBoundaryGrid
 Adapt.adapt_structure(to, ibg::IBG{FT, TX, TY, TZ}) where {FT, TX, TY, TZ} =
     ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), 
                                      adapt(to, ibg.immersed_boundary), 
-                                     adapt(to, ibg.interior_active_cells), 
-                                     adapt(to, ibg.active_z_columns))
+                                     nothing, 
+                                     nothing)
 
 with_halo(halo, ibg::ImmersedBoundaryGrid) =
     ImmersedBoundaryGrid(with_halo(halo, ibg.underlying_grid), ibg.immersed_boundary)
@@ -268,10 +269,21 @@ xnodes(ibg::IBG, loc; kwargs...) = xnodes(ibg.underlying_grid, loc; kwargs...)
 ynodes(ibg::IBG, loc; kwargs...) = ynodes(ibg.underlying_grid, loc; kwargs...)
 znodes(ibg::IBG, loc; kwargs...) = znodes(ibg.underlying_grid, loc; kwargs...)
 
+λnodes(ibg::IBG, ℓx, ℓy, ℓz; kwargs...) = λnodes(ibg.underlying_grid, ℓx, ℓy, ℓz; kwargs...)
+φnodes(ibg::IBG, ℓx, ℓy, ℓz; kwargs...) = φnodes(ibg.underlying_grid, ℓx, ℓy, ℓz; kwargs...)
+xnodes(ibg::IBG, ℓx, ℓy, ℓz; kwargs...) = xnodes(ibg.underlying_grid, ℓx, ℓy, ℓz; kwargs...)
+ynodes(ibg::IBG, ℓx, ℓy, ℓz; kwargs...) = ynodes(ibg.underlying_grid, ℓx, ℓy, ℓz; kwargs...)
+znodes(ibg::IBG, ℓx, ℓy, ℓz; kwargs...) = znodes(ibg.underlying_grid, ℓx, ℓy, ℓz; kwargs...)
+
 @inline cpu_face_constructor_x(ibg::IBG) = cpu_face_constructor_x(ibg.underlying_grid)
 @inline cpu_face_constructor_y(ibg::IBG) = cpu_face_constructor_y(ibg.underlying_grid)
 @inline cpu_face_constructor_z(ibg::IBG) = cpu_face_constructor_z(ibg.underlying_grid)
 
+node_names(ibg::IBG, ℓx, ℓy, ℓz) = node_names(ibg.underlying_grid, ℓx, ℓy, ℓz)
+ξname(ibg::IBG) = ξname(ibg.underlying_grid)
+ηname(ibg::IBG) = ηname(ibg.underlying_grid)
+rname(ibg::IBG) = rname(ibg.underlying_grid)
+
 function on_architecture(arch, ibg::IBG)
     underlying_grid   = on_architecture(arch, ibg.underlying_grid)
     immersed_boundary = on_architecture(arch, ibg.immersed_boundary)
diff --git a/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
index 219d41e2cb..dcc442b6ac 100644
--- a/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
+++ b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
@@ -53,3 +53,20 @@ const AGFB = AbstractGridFittedBoundary
     @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, Flat, Flat},   ib::AGFB) = _immersed_cell(1, 1, 1, grid, ib)
 end
 
+function clamp_bottom_height!(bottom_field, grid)
+    launch!(architecture(grid), grid, :xy, _clamp_bottom_height!, bottom_field, grid)
+    return nothing
+end
+
+const c = Center()
+const f = Face()
+
+@kernel function _clamp_bottom_height!(z, grid)
+    i, j = @index(Global, NTuple)
+    Nz = size(grid, 3)
+    zmin = znode(i, j, 1,    grid, c, c, f)
+    zmax = znode(i, j, Nz+1, grid, c, c, f)
+    @inbounds z[i, j, 1] = clamp(z[i, j, 1], zmin, zmax)
+end
+
+
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 87e0692482..10caefb386 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -4,45 +4,48 @@ using Oceananigans.Grids: AbstractGrid
 
 using KernelAbstractions: @kernel, @index
 
-import Oceananigans.Grids: active_surface_map, active_interior_map
+import Oceananigans.Grids: retrieve_surface_active_cells_map, retrieve_interior_active_cells_map
 import Oceananigans.Utils: active_cells_work_layout
 
 using Oceananigans.Solvers: solve_batched_tridiagonal_system_z!, ZDirection
-using Oceananigans.DistributedComputations: DistributedGrid
+using Oceananigans.DistributedComputations: DistributedGrid, SynchronizedDistributed
 
 import Oceananigans.Solvers: solve_batched_tridiagonal_system_kernel!
 
-const DistributedActiveCellsIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:DistributedGrid, <:Any, <:NamedTuple} # Cannot be used to dispatch in kernels!!!
-const ArrayActiveCellsIBG       = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractArray}
-const NamedTupleActiveCellsIBG  = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:NamedTuple}
+# REMEMBER: since the active map is stripped out of the grid when `Adapt`ing to the GPU,
+# the following types cannot be used to dispatch in kernels!!!
+
+# An IBG with a single interior active cells map that includes the whole :xyz domain
+const WholeActiveCellsMapIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractArray}
+
+# An IBG with an interior active cells map subdivided into 5 different sub-maps.
+# Only used (for the moment) on distributed architectures, where the boundary-adjacent region
+# has to be computed separately. These maps hold the active region in the "halo-independent" part of the domain
+# (`halo_independent_cells`) and in the "halo-dependent" regions to the west, east, south, and north, respectively.
+const SplitActiveCellsMapIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:NamedTuple}
+
+# A distributed grid with split interior map
+const DistributedActiveCellsIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:DistributedGrid, <:Any, <:NamedTuple} 
 
 """
 A constant representing an immersed boundary grid, where interior active cells are mapped to linear indices in grid.interior_active_cells
 """
-const ActiveCellsIBG = Union{DistributedActiveCellsIBG, ArrayActiveCellsIBG, NamedTupleActiveCellsIBG}
+const ActiveCellsIBG = Union{DistributedActiveCellsIBG, WholeActiveCellsMapIBG, SplitActiveCellsMapIBG}
 
 """
 A constant representing an immersed boundary grid, where active columns in the Z-direction are mapped to linear indices in grid.active_z_columns
 """
 const ActiveZColumnsIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractArray}
 
-struct InteriorMap end
-struct ZColumnMap end
-
-struct WestMap  end
-struct EastMap  end
-struct SouthMap end
-struct NorthMap end
+@inline retrieve_surface_active_cells_map(grid::ActiveZColumnsIBG) = grid.active_z_columns
 
-@inline active_surface_map(::ActiveZColumnsIBG) = ZColumnMap()
-
-@inline active_interior_map(::Val{:west})  = WestMap()
-@inline active_interior_map(::Val{:east})  = EastMap()
-@inline active_interior_map(::Val{:south}) = SouthMap()
-@inline active_interior_map(::Val{:north}) = NorthMap()
-
-@inline active_interior_map(::ActiveCellsIBG)            = InteriorMap()
-@inline active_interior_map(::DistributedActiveCellsIBG) = InteriorMap()
+@inline retrieve_interior_active_cells_map(grid::WholeActiveCellsMapIBG, ::Val{:interior}) = grid.interior_active_cells
+@inline retrieve_interior_active_cells_map(grid::SplitActiveCellsMapIBG, ::Val{:interior}) = grid.interior_active_cells.halo_independent_cells
+@inline retrieve_interior_active_cells_map(grid::SplitActiveCellsMapIBG, ::Val{:west})     = grid.interior_active_cells.west_halo_dependent_cells
+@inline retrieve_interior_active_cells_map(grid::SplitActiveCellsMapIBG, ::Val{:east})     = grid.interior_active_cells.east_halo_dependent_cells
+@inline retrieve_interior_active_cells_map(grid::SplitActiveCellsMapIBG, ::Val{:south})    = grid.interior_active_cells.south_halo_dependent_cells
+@inline retrieve_interior_active_cells_map(grid::SplitActiveCellsMapIBG, ::Val{:north})    = grid.interior_active_cells.north_halo_dependent_cells
+@inline retrieve_interior_active_cells_map(grid::ActiveZColumnsIBG,      ::Val{:surface})  = grid.active_z_columns
 
 """
     active_cells_work_layout(group, size, map_type, grid)
@@ -52,19 +55,12 @@ Compute the work layout for active cells based on the given map type and grid.
 # Arguments
 - `group`: The previous workgroup.
 - `size`: The previous worksize.
-- `map_type`: The type of map (e.g., `InteriorMap`, `WestMap`, `EastMap`, `SouthMap`, `NorthMap`).
-- `grid`: The grid containing the active cells.
+- `active_cells_map`: The map containing the indices of the active cells; its length determines the worksize.
 
 # Returns
 - A tuple `(workgroup, worksize)` representing the work layout for active cells.
 """
-@inline active_cells_work_layout(group, size, ::InteriorMap, grid::ArrayActiveCellsIBG)      = min(length(grid.interior_active_cells), 256),          length(grid.interior_active_cells)
-@inline active_cells_work_layout(group, size, ::InteriorMap, grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.interior), 256), length(grid.interior_active_cells.interior)
-@inline active_cells_work_layout(group, size, ::WestMap,     grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.west),     256), length(grid.interior_active_cells.west)
-@inline active_cells_work_layout(group, size, ::EastMap,     grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.east),     256), length(grid.interior_active_cells.east)
-@inline active_cells_work_layout(group, size, ::SouthMap,    grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.south),    256), length(grid.interior_active_cells.south)
-@inline active_cells_work_layout(group, size, ::NorthMap,    grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.north),    256), length(grid.interior_active_cells.north)
-@inline active_cells_work_layout(group, size, ::ZColumnMap,  grid::ActiveZColumnsIBG)        = min(length(grid.active_z_columns),  256),              length(grid.active_z_columns)
+@inline active_cells_work_layout(group, size, active_cells_map) = min(length(active_cells_map), 256), length(active_cells_map)
 
 """
     active_linear_index_to_tuple(idx, map, grid)
@@ -73,19 +69,12 @@ Converts a linear index to a tuple of indices based on the given map and grid.
 
 # Arguments
 - `idx`: The linear index to convert.
-- `map`: The map indicating the type of index conversion to perform.
-- `grid`: The grid containing the active cells.
+- `active_cells_map`: The map containing the N-dimensional index of the active cells
 
 # Returns
 A tuple of indices corresponding to the linear index.
 """
-@inline active_linear_index_to_tuple(idx, ::InteriorMap, grid::ArrayActiveCellsIBG)      = Base.map(Int, grid.interior_active_cells[idx])
-@inline active_linear_index_to_tuple(idx, ::InteriorMap, grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.interior[idx])
-@inline active_linear_index_to_tuple(idx, ::WestMap,     grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.west[idx])
-@inline active_linear_index_to_tuple(idx, ::EastMap,     grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.east[idx])
-@inline active_linear_index_to_tuple(idx, ::SouthMap,    grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.south[idx])
-@inline active_linear_index_to_tuple(idx, ::NorthMap,    grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.north[idx])
-@inline active_linear_index_to_tuple(idx, ::ZColumnMap,  grid::ActiveZColumnsIBG)         = Base.map(Int, grid.active_z_columns[idx])
+@inline active_linear_index_to_tuple(idx, active_cells_map) = @inbounds Base.map(Int, active_cells_map[idx])
 
 function ImmersedBoundaryGrid(grid, ib; active_cells_map::Bool = true) 
 
@@ -126,14 +115,24 @@ function compute_interior_active_cells(ibg; parameters = :xyz)
 end
 
 function compute_active_z_columns(ibg)
-    one_field = ConditionalOperation{Center, Center, Center}(OneField(Int), identity, ibg, NotImmersed(truefunc), 0)
-    column    = sum(one_field, dims = 3)
-    is_immersed_column = KernelFunctionOperation{Center, Center, Nothing}(active_column, ibg, column)
-    active_cells_field = Field{Center, Center, Nothing}(ibg, Bool)
-    set!(active_cells_field, is_immersed_column)
-    return active_cells_field
+    one_field = OneField(Int)
+    condition = NotImmersed(truefunc)
+    mask = 0
+
+    # Compute all the active cells in a z-column using a ConditionalOperation
+    conditional_active_cells = ConditionalOperation{Center, Center, Center}(one_field, identity, ibg, condition, mask)
+    active_cells_in_column   = sum(conditional_active_cells, dims = 3)
+
+    # Check whether the column ``i, j`` is immersed, which would correspond to `active_cells_in_column[i, j, 1] == 0`
+    is_immersed_column = KernelFunctionOperation{Center, Center, Nothing}(active_column, ibg, active_cells_in_column)
+    active_z_columns = Field{Center, Center, Nothing}(ibg, Bool)
+    set!(active_z_columns, is_immersed_column)
+
+    return active_z_columns
 end
 
+# Maximum integer represented by the 
+# `UInt8`, `UInt16` and `UInt32` types
 const MAXUInt8  = 2^8  - 1
 const MAXUInt16 = 2^16 - 1
 const MAXUInt32 = 2^32 - 1
@@ -141,7 +140,8 @@ const MAXUInt32 = 2^32 - 1
 """
     interior_active_indices(ibg; parameters = :xyz)
 
-Compute the indices of the active interior cells in the given immersed boundary grid.
+Compute the indices of the active interior cells in the given immersed boundary grid within the indices
+specified by the `parameters` keyword argument
 
 # Arguments
 - `ibg`: The immersed boundary grid.
@@ -158,7 +158,9 @@ function interior_active_indices(ibg; parameters = :xyz)
    
     IndicesType = Tuple{IntType, IntType, IntType}
 
-    # Cannot findall on the entire field because we incur on OOM errors
+    # We cannot `findall` on the entire field because we could incur OOM errors.
+    # For this reason, we split the computation into vertical levels, `findall` the active indices in
+    # successive xy planes, and then stitch them back together.
     active_indices = IndicesType[]
     active_indices = findall_active_indices!(active_indices, active_cells_field, ibg, IndicesType)
     active_indices = on_architecture(architecture(ibg), active_indices)
@@ -187,15 +189,32 @@ function convert_interior_indices(interior_indices, k, IndicesType)
     return interior_indices
 end
 
-@inline add_3rd_index(t::Tuple, k) = (t[1], t[2], k) 
+@inline add_3rd_index(ij::Tuple, k) = (ij[1], ij[2], k) 
 
+# In case of a serial grid, the interior computations are performed over the whole three-dimensional
+# domain. Therefore, the `interior_active_cells` field contains the indices of all the active cells in 
+# the range 1:Nx, 1:Ny and 1:Nz (i.e., we construct the map with parameters :xyz)
 map_interior_active_cells(ibg) = interior_active_indices(ibg; parameters = :xyz)
 
-# In case of a `DistributedGrid` we want to have different maps depending on the 
-# partitioning of the domain
+# In case of a `DistributedGrid` we want to have different maps depending on the partitioning of the domain:
+#
+# If we partition the domain in the x-direction, we typically want to have the option to split three-dimensional
+# kernels into a `halo-independent` part in the range Hx+1:Nx-Hx, 1:Ny, 1:Nz and two `halo-dependent` computations:
+# a west one spanning 1:Hx, 1:Ny, 1:Nz and an east one spanning Nx-Hx+1:Nx, 1:Ny, 1:Nz.
+# For this reason we need three different maps: one containing the `halo_independent` active region, a `west` map, and an `east` map.
+# For the same reason we need to construct `south` and `north` maps if we partition the domain in the y-direction.
+# Therefore, the `interior_active_cells` in this case is a `NamedTuple` containing 5 elements.
+# Note that boundary-adjacent maps corresponding to non-partitioned directions are set to `nothing`.
 function map_interior_active_cells(ibg::ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:DistributedGrid})
 
     arch = architecture(ibg)
+
+    # If we are using a synchronized architecture, nothing
+    # changes with respect to serial execution.
+    if arch isa SynchronizedDistributed
+        return interior_active_indices(ibg; parameters = :xyz)
+    end
+
     Rx, Ry, _  = arch.ranks
     Tx, Ty, _  = topology(ibg)
     Nx, Ny, Nz = size(ibg)
@@ -213,10 +232,10 @@ function map_interior_active_cells(ibg::ImmersedBoundaryGrid{<:Any, <:Any, <:Any
     include_south = !isa(ibg, YFlatGrid) && (Ry != 1) && !(Ty == RightConnected)
     include_north = !isa(ibg, YFlatGrid) && (Ry != 1) && !(Ty == LeftConnected)
 
-    west  = include_west  ? interior_active_indices(ibg; parameters = KernelParameters(x_boundary, left_offsets))    : nothing
-    east  = include_east  ? interior_active_indices(ibg; parameters = KernelParameters(x_boundary, right_x_offsets)) : nothing
-    south = include_south ? interior_active_indices(ibg; parameters = KernelParameters(y_boundary, left_offsets))    : nothing
-    north = include_north ? interior_active_indices(ibg; parameters = KernelParameters(y_boundary, right_y_offsets)) : nothing
+    west_halo_dependent_cells  = include_west  ? interior_active_indices(ibg; parameters = KernelParameters(x_boundary, left_offsets))    : nothing
+    east_halo_dependent_cells  = include_east  ? interior_active_indices(ibg; parameters = KernelParameters(x_boundary, right_x_offsets)) : nothing
+    south_halo_dependent_cells = include_south ? interior_active_indices(ibg; parameters = KernelParameters(y_boundary, left_offsets))    : nothing
+    north_halo_dependent_cells = include_north ? interior_active_indices(ibg; parameters = KernelParameters(y_boundary, right_y_offsets)) : nothing
     
     nx = Rx == 1 ? Nx : (Tx == RightConnected || Tx == LeftConnected ? Nx - Hx : Nx - 2Hx)
     ny = Ry == 1 ? Ny : (Ty == RightConnected || Ty == LeftConnected ? Ny - Hy : Ny - 2Hy)
@@ -224,9 +243,13 @@ function map_interior_active_cells(ibg::ImmersedBoundaryGrid{<:Any, <:Any, <:Any
     ox = Rx == 1 || Tx == RightConnected ? 0 : Hx
     oy = Ry == 1 || Ty == RightConnected ? 0 : Hy
      
-    interior = interior_active_indices(ibg; parameters = KernelParameters((nx, ny, Nz), (ox, oy, 0)))
+    halo_independent_cells = interior_active_indices(ibg; parameters = KernelParameters((nx, ny, Nz), (ox, oy, 0)))
 
-    return (; interior, west, east, south, north)
+    return (; halo_independent_cells, 
+              west_halo_dependent_cells, 
+              east_halo_dependent_cells, 
+              south_halo_dependent_cells, 
+              north_halo_dependent_cells)
 end
 
 # If we eventually want to perform also barotropic step, `w` computation and `p` 
diff --git a/src/ImmersedBoundaries/conditional_fluxes.jl b/src/ImmersedBoundaries/conditional_fluxes.jl
index e3abc82059..260e397c75 100644
--- a/src/ImmersedBoundaries/conditional_fluxes.jl
+++ b/src/ImmersedBoundaries/conditional_fluxes.jl
@@ -1,6 +1,8 @@
 using Oceananigans.Advection: AbstractAdvectionScheme, advection_buffers
 using Oceananigans.Operators: ℑxᶠᵃᵃ, ℑxᶜᵃᵃ, ℑyᵃᶠᵃ, ℑyᵃᶜᵃ, ℑzᵃᵃᶠ, ℑzᵃᵃᶜ 
 using Oceananigans.TurbulenceClosures: AbstractTurbulenceClosure, AbstractTimeDiscretization
+using Oceananigans.Advection: LOADV, HOADV, WENO, TracerAdvection
+using Oceananigans.Fields: ZeroField
 
 const ATC = AbstractTurbulenceClosure
 const ATD = AbstractTimeDiscretization
@@ -80,6 +82,48 @@ end
 @inline _advective_tracer_flux_y(i, j, k, ibg::IBG, args...) = conditional_flux_cfc(i, j, k, ibg, zero(ibg), advective_tracer_flux_y(i, j, k, ibg, args...))
 @inline _advective_tracer_flux_z(i, j, k, ibg::IBG, args...) = conditional_flux_ccf(i, j, k, ibg, zero(ibg), advective_tracer_flux_z(i, j, k, ibg, args...))
 
+# Disambiguation for tracer fluxes: a `TracerAdvection` forwards to its per-direction scheme (`.x`, `.y`, `.z`).
+@inline _advective_tracer_flux_x(i, j, k, ibg::IBG, advection::TracerAdvection, args...) =
+        _advective_tracer_flux_x(i, j, k, ibg, advection.x, args...)
+
+@inline _advective_tracer_flux_y(i, j, k, ibg::IBG, advection::TracerAdvection, args...) =
+        _advective_tracer_flux_y(i, j, k, ibg, advection.y, args...)
+
+@inline _advective_tracer_flux_z(i, j, k, ibg::IBG, advection::TracerAdvection, args...) =
+        _advective_tracer_flux_z(i, j, k, ibg, advection.z, args...)
+
+# Fallback for `nothing` advection
+@inline _advective_tracer_flux_x(i, j, k, ibg::IBG, ::Nothing, args...) = zero(ibg)
+@inline _advective_tracer_flux_y(i, j, k, ibg::IBG, ::Nothing, args...) = zero(ibg)
+@inline _advective_tracer_flux_z(i, j, k, ibg::IBG, ::Nothing, args...) = zero(ibg)
+
+# Fallback for `nothing` advection and `ZeroField` tracers and velocities
+@inline _advective_tracer_flux_x(i, j, k, ibg::IBG, ::Nothing, ::ZeroField, ::ZeroField) = zero(ibg)
+@inline _advective_tracer_flux_y(i, j, k, ibg::IBG, ::Nothing, ::ZeroField, ::ZeroField) = zero(ibg)
+@inline _advective_tracer_flux_z(i, j, k, ibg::IBG, ::Nothing, ::ZeroField, ::ZeroField) = zero(ibg)
+
+@inline _advective_tracer_flux_x(i, j, k, ibg::IBG, ::Nothing, U, ::ZeroField) = zero(ibg)
+@inline _advective_tracer_flux_y(i, j, k, ibg::IBG, ::Nothing, V, ::ZeroField) = zero(ibg)
+@inline _advective_tracer_flux_z(i, j, k, ibg::IBG, ::Nothing, W, ::ZeroField) = zero(ibg)
+@inline _advective_tracer_flux_x(i, j, k, ibg::IBG, ::Nothing, ::ZeroField, c) = zero(ibg)
+@inline _advective_tracer_flux_y(i, j, k, ibg::IBG, ::Nothing, ::ZeroField, c) = zero(ibg)
+@inline _advective_tracer_flux_z(i, j, k, ibg::IBG, ::Nothing, ::ZeroField, c) = zero(ibg)
+
+# Fallback for `ZeroField` tracers and velocities
+@inline _advective_tracer_flux_x(i, j, k, ibg::IBG, scheme, ::ZeroField, ::ZeroField) = zero(ibg)
+@inline _advective_tracer_flux_y(i, j, k, ibg::IBG, scheme, ::ZeroField, ::ZeroField) = zero(ibg)
+@inline _advective_tracer_flux_z(i, j, k, ibg::IBG, scheme, ::ZeroField, ::ZeroField) = zero(ibg)
+
+# Fallback for `ZeroField` tracers
+@inline _advective_tracer_flux_x(i, j, k, ibg::IBG, scheme, U, ::ZeroField) = zero(ibg)
+@inline _advective_tracer_flux_y(i, j, k, ibg::IBG, scheme, V, ::ZeroField) = zero(ibg)
+@inline _advective_tracer_flux_z(i, j, k, ibg::IBG, scheme, W, ::ZeroField) = zero(ibg)
+
+# Fallback for `ZeroField` velocities
+@inline _advective_tracer_flux_x(i, j, k, ibg::IBG, scheme, ::ZeroField, c) = zero(ibg)
+@inline _advective_tracer_flux_y(i, j, k, ibg::IBG, scheme, ::ZeroField, c) = zero(ibg)
+@inline _advective_tracer_flux_z(i, j, k, ibg::IBG, scheme, ::ZeroField, c) = zero(ibg)
+
 #####
 ##### "Boundary-aware" reconstruct
 #####
@@ -87,25 +131,23 @@ end
 #####
 
 """
-    calc_inactive_stencil(buffer, shift, dir, side;
-                          xside = :ᶠ, yside = :ᶠ, zside = :ᶠ,
-                          xshift = 0, yshift = 0, zshift = 0) 
+    inside_immersed_boundary(buffer, shift, dir, side;
+                             xside = :ᶠ, yside = :ᶠ, zside = :ᶠ) 
 
-Calculate the correct stencil needed for each indiviual reconstruction (i.e., symmetric, left biased and right biased, 
-on `Face`s and on `Center`s)
+Return the `inactive_node` expressions, one per node of the stencil required for reconstruction, used to check whether the stencil contains immersed nodes.
 
 Example
 =======
 
 ```
-julia> calc_inactive_cells(2, :none, :z, :ᶜ)
+julia> inside_immersed_boundary(2, :none, :z, :ᶜ)
 4-element Vector{Any}:
  :(inactive_node(i, j, k + -1, ibg, c, c, f))
  :(inactive_node(i, j, k + 0,  ibg, c, c, f))
  :(inactive_node(i, j, k + 1,  ibg, c, c, f))
  :(inactive_node(i, j, k + 2,  ibg, c, c, f))
 
-julia> calc_inactive_cells(3, :left, :x, :ᶠ)
+julia> inside_immersed_boundary(3, :left, :x, :ᶠ)
 5-element Vector{Any}:
  :(inactive_node(i + -3, j, k, ibg, c, c, c))
  :(inactive_node(i + -2, j, k, ibg, c, c, c))
@@ -114,102 +156,101 @@ julia> calc_inactive_cells(3, :left, :x, :ᶠ)
  :(inactive_node(i + 1,  j, k, ibg, c, c, c))
 ```
 """
-@inline function calc_inactive_stencil(buffer, shift, dir, side;
-                                       xside = :ᶠ, yside = :ᶠ, zside = :ᶠ,
-                                       xshift = 0, yshift = 0, zshift = 0)
+@inline function inside_immersed_boundary(buffer, shift, dir, side;
+                                          xside = :ᶠ, yside = :ᶠ, zside = :ᶠ)
 
     N = buffer * 2
     if shift != :none
         N -=1
     end
-    inactive_cells  = Vector(undef, N)
 
-    rng = 1:N
-    if shift == :right
-        rng = rng .+ 1
+    if shift == :interior
+        rng = 1:N+1
+    elseif shift == :right
+        rng = 2:N+1
+    else
+        rng = 1:N
     end
 
+    inactive_cells  = Vector(undef, length(rng))
+
     for (idx, n) in enumerate(rng)
         c = side == :ᶠ ? n - buffer - 1 : n - buffer 
         xflipside = xside == :ᶠ ? :c : :f
         yflipside = yside == :ᶠ ? :c : :f
         zflipside = zside == :ᶠ ? :c : :f
         inactive_cells[idx] =  dir == :x ? 
-                               :(inactive_node(i + $(c + xshift), j + $yshift, k + $zshift, ibg, $xflipside, $yflipside, $zflipside)) :
+                               :(inactive_node(i + $c, j, k, ibg, $xflipside, $yflipside, $zflipside)) :
                                dir == :y ?
-                               :(inactive_node(i + $xshift, j + $(c + yshift), k + $zshift, ibg, $xflipside, $yflipside, $zflipside)) :
-                               :(inactive_node(i + $xshift, j + $yshift, k + $(c + zshift), ibg, $xflipside, $yflipside, $zflipside))
+                               :(inactive_node(i, j + $c, k, ibg, $xflipside, $yflipside, $zflipside)) :
+                               :(inactive_node(i, j, k + $c, ibg, $xflipside, $yflipside, $zflipside))
     end
 
     return inactive_cells
 end
 
-for (bias, shift) in zip((:symmetric, :left_biased, :right_biased), (:none, :left, :right)), side in (:ᶜ, :ᶠ)
-    near_x_boundary = Symbol(:near_x_immersed_boundary_, bias, side)
-    near_y_boundary = Symbol(:near_y_immersed_boundary_, bias, side)
-    near_z_boundary = Symbol(:near_z_immersed_boundary_, bias, side)
+for side in (:ᶜ, :ᶠ)
+    near_x_boundary_symm = Symbol(:near_x_immersed_boundary_symmetric, side)
+    near_y_boundary_symm = Symbol(:near_y_immersed_boundary_symmetric, side)
+    near_z_boundary_symm = Symbol(:near_z_immersed_boundary_symmetric, side)
+
+    near_x_boundary_bias = Symbol(:near_x_immersed_boundary_biased, side)
+    near_y_boundary_bias = Symbol(:near_y_immersed_boundary_biased, side)
+    near_z_boundary_bias = Symbol(:near_z_immersed_boundary_biased, side)
 
     @eval begin
-        @inline $near_x_boundary(i, j, k, ibg, ::AbstractAdvectionScheme{0}) = false
-        @inline $near_y_boundary(i, j, k, ibg, ::AbstractAdvectionScheme{0}) = false
-        @inline $near_z_boundary(i, j, k, ibg, ::AbstractAdvectionScheme{0}) = false
-    end
+        @inline $near_x_boundary_symm(i, j, k, ibg, ::AbstractAdvectionScheme{0}, args...) = false
+        @inline $near_y_boundary_symm(i, j, k, ibg, ::AbstractAdvectionScheme{0}, args...) = false
+        @inline $near_z_boundary_symm(i, j, k, ibg, ::AbstractAdvectionScheme{0}, args...) = false
 
-    for buffer in advection_buffers
-        @eval begin
-            @inline $near_x_boundary(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = @inbounds (|)($(calc_inactive_stencil(buffer, shift, :x, side; xside = side)...))
-            @inline $near_y_boundary(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = @inbounds (|)($(calc_inactive_stencil(buffer, shift, :y, side; yside = side)...))
-            @inline $near_z_boundary(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = @inbounds (|)($(calc_inactive_stencil(buffer, shift, :z, side; zside = side)...))
-        end
+        @inline $near_x_boundary_bias(i, j, k, ibg, ::AbstractAdvectionScheme{0}, args...) = false
+        @inline $near_y_boundary_bias(i, j, k, ibg, ::AbstractAdvectionScheme{0}, args...) = false
+        @inline $near_z_boundary_bias(i, j, k, ibg, ::AbstractAdvectionScheme{0}, args...) = false
     end
-end
 
-# Horizontal inactive stencil calculation for vector invariant WENO schemes that use velocity as a smoothness indicator
-for (bias, shift) in zip((:symmetric, :left_biased, :right_biased), (:none, :left, :right))
-    near_x_horizontal_boundary = Symbol(:near_x_horizontal_boundary_, bias)
-    near_y_horizontal_boundary = Symbol(:near_y_horizontal_boundary_, bias)
-    
     for buffer in advection_buffers
         @eval begin
-            @inline $near_x_horizontal_boundary(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = 
-                @inbounds (|)($(calc_inactive_stencil(buffer+1, shift, :x, :ᶜ; yside = :ᶜ)...), 
-                              $(calc_inactive_stencil(buffer,   shift, :x, :ᶜ; xside = :ᶜ)...), 
-                              $(calc_inactive_stencil(buffer,   shift, :x, :ᶜ; xside = :ᶜ, yshift = 1)...))
-
-            @inline $near_y_horizontal_boundary(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) =
-                @inbounds (|)($(calc_inactive_stencil(buffer+1, shift, :y, :ᶜ; xside = :ᶜ)...),
-                              $(calc_inactive_stencil(buffer,   shift, :y, :ᶜ; yside = :ᶜ)...),
-                              $(calc_inactive_stencil(buffer,   shift, :y, :ᶜ; yside = :ᶜ, xshift = 1)...))
+            @inline $near_x_boundary_symm(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = (|)($(inside_immersed_boundary(buffer, :none, :x, side; xside = side)...))
+            @inline $near_y_boundary_symm(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = (|)($(inside_immersed_boundary(buffer, :none, :y, side; yside = side)...))
+            @inline $near_z_boundary_symm(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = (|)($(inside_immersed_boundary(buffer, :none, :z, side; zside = side)...))
+    
+            @inline $near_x_boundary_bias(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = (|)($(inside_immersed_boundary(buffer, :interior, :x, side; xside = side)...))
+            @inline $near_y_boundary_bias(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = (|)($(inside_immersed_boundary(buffer, :interior, :y, side; yside = side)...))
+            @inline $near_z_boundary_bias(i, j, k, ibg, ::AbstractAdvectionScheme{$buffer}) = (|)($(inside_immersed_boundary(buffer, :interior, :z, side; zside = side)...))
         end
     end
 end
 
-using Oceananigans.Advection: LOADV, HOADV, WENO
-using Oceananigans.Advection: AbstractSmoothnessStencil, VelocityStencil, DefaultStencil
-
-for bias in (:symmetric, :left_biased, :right_biased)
+for bias in (:symmetric, :biased)
     for (d, ξ) in enumerate((:x, :y, :z))
-
         code = [:ᵃ, :ᵃ, :ᵃ]
 
-        for loc in (:ᶜ, :ᶠ)
+        for loc in (:ᶜ, :ᶠ), alt in (:_, :__, :___, :____, :_____)
             code[d] = loc
             interp = Symbol(bias, :_interpolate_, ξ, code...)
-            alt_interp = Symbol(:_, interp)
-
-            near_boundary = Symbol(:near_, ξ, :_immersed_boundary_, bias, loc)
-
+            alt_interp = Symbol(alt, interp)
             @eval begin
                 import Oceananigans.Advection: $alt_interp
                 using Oceananigans.Advection: $interp
+            end
+        end
 
-                # Fallback for low order interpolation
-                @inline $alt_interp(i, j, k, ibg::ImmersedBoundaryGrid, scheme::LOADV, args...) = $interp(i, j, k, ibg.underlying_grid, scheme, args...)
+        for loc in (:ᶜ, :ᶠ), (alt1, alt2) in zip((:_, :__, :___, :____, :_____), (:_____, :_, :__, :___, :____))
+            code[d] = loc
+            interp = Symbol(bias, :_interpolate_, ξ, code...)
+            alt1_interp = Symbol(alt1, interp)
+            alt2_interp = Symbol(alt2, interp)
 
+            near_boundary = Symbol(:near_, ξ, :_immersed_boundary_, bias, loc)
+
+            @eval begin
+                # Fallback for low order interpolation
+                @inline $alt1_interp(i, j, k, ibg::ImmersedBoundaryGrid, scheme::LOADV, args...) = $interp(i, j, k, ibg, scheme, args...)
+                
                 # Conditional high-order interpolation in Bounded directions
-                @inline $alt_interp(i, j, k, ibg::ImmersedBoundaryGrid, scheme::HOADV, args...) =
+                @inline $alt1_interp(i, j, k, ibg::ImmersedBoundaryGrid, scheme::HOADV, args...) =
                     ifelse($near_boundary(i, j, k, ibg, scheme),
-                           $alt_interp(i, j, k, ibg, scheme.buffer_scheme, args...),
+                           $alt2_interp(i, j, k, ibg, scheme.buffer_scheme, args...),
                            $interp(i, j, k, ibg, scheme, args...))
             end
         end
diff --git a/src/ImmersedBoundaries/distributed_immersed_boundaries.jl b/src/ImmersedBoundaries/distributed_immersed_boundaries.jl
index 726edda8be..4b19202b92 100644
--- a/src/ImmersedBoundaries/distributed_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/distributed_immersed_boundaries.jl
@@ -26,12 +26,16 @@ function with_halo(new_halo, grid::DistributedImmersedBoundaryGrid)
     return ImmersedBoundaryGrid(new_underlying_grid, new_immersed_boundary)
 end
 
-function scatter_local_grids(arch::Distributed, global_grid::ImmersedBoundaryGrid, local_size)
+function scatter_local_grids(global_grid::ImmersedBoundaryGrid, arch::Distributed, local_size)
     ib = global_grid.immersed_boundary
     ug = global_grid.underlying_grid
 
-    local_ug = scatter_local_grids(arch, ug, local_size)
-    local_ib = getnamewrapper(ib)(partition_global_array(arch, ib.bottom_height, local_size))
+    local_ug = scatter_local_grids(ug, arch, local_size)
+
+    # Kinda hacky: rebuild the local immersed boundary from the partitioned bottom height alone (no other field of `ib` is passed to the constructor)
+    local_bottom_height = partition(ib.bottom_height, arch, local_size)
+    ImmersedBoundaryConstructor = getnamewrapper(ib)
+    local_ib = ImmersedBoundaryConstructor(local_bottom_height)
     
     return ImmersedBoundaryGrid(local_ug, local_ib)
 end
diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index f456476f61..e007f461a1 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -54,15 +54,15 @@ Keyword Arguments
 GridFittedBottom(bottom_height) = GridFittedBottom(bottom_height, CenterImmersedCondition())
 
 function Base.summary(ib::GridFittedBottom)
-    hmax  = maximum(ib.bottom_height)
-    hmin  = minimum(ib.bottom_height)
-    hmean = mean(ib.bottom_height)
+    zmax  = maximum(ib.bottom_height)
+    zmin  = minimum(ib.bottom_height)
+    zmean = mean(ib.bottom_height)
 
     summary1 = "GridFittedBottom("
 
-    summary2 = string("mean(z)=", prettysummary(hmean),
-                      ", min(z)=", prettysummary(hmin),
-                      ", max(z)=", prettysummary(hmax))
+    summary2 = string("mean(z)=", prettysummary(zmean),
+                      ", min(z)=", prettysummary(zmin),
+                      ", max(z)=", prettysummary(zmax))
 
     summary3 = ")"
 
@@ -87,6 +87,7 @@ Computes `ib.bottom_height` and wraps it in a Field.
 function ImmersedBoundaryGrid(grid, ib::GridFittedBottom)
     bottom_field = Field{Center, Center, Nothing}(grid)
     set!(bottom_field, ib.bottom_height)
+    @apply_regionally clamp_bottom_height!(bottom_field, grid)
     fill_halo_regions!(bottom_field)
     new_ib = GridFittedBottom(bottom_field, ib.immersed_condition)
     TX, TY, TZ = topology(grid)
diff --git a/src/ImmersedBoundaries/grid_fitted_boundary.jl b/src/ImmersedBoundaries/grid_fitted_boundary.jl
index 2805a78e30..3beabcae24 100644
--- a/src/ImmersedBoundaries/grid_fitted_boundary.jl
+++ b/src/ImmersedBoundaries/grid_fitted_boundary.jl
@@ -1,5 +1,11 @@
 using OffsetArrays
 
+"""
+
+    GridFittedBoundary(mask)
+
+Return an immersed boundary with a three-dimensional `mask`.
+"""
 struct GridFittedBoundary{M} <: AbstractGridFittedBoundary
     mask :: M
 end
diff --git a/src/ImmersedBoundaries/immersed_boundary_condition.jl b/src/ImmersedBoundaries/immersed_boundary_condition.jl
index 34a7981c3c..423b2cd381 100644
--- a/src/ImmersedBoundaries/immersed_boundary_condition.jl
+++ b/src/ImmersedBoundaries/immersed_boundary_condition.jl
@@ -4,7 +4,7 @@ using Oceananigans.BoundaryConditions: DefaultBoundaryCondition
 using Oceananigans.TurbulenceClosures: AbstractScalarDiffusivity, h_diffusivity, z_diffusivity
 using Oceananigans.Operators: index_left, index_right, Δx, Δy, Δz, div
 
-import Oceananigans.BoundaryConditions: regularize_immersed_boundary_condition, bc_str
+import Oceananigans.BoundaryConditions: regularize_immersed_boundary_condition, bc_str, update_boundary_condition!
 
 import Oceananigans.TurbulenceClosures: immersed_∂ⱼ_τ₁ⱼ,
                                         immersed_∂ⱼ_τ₂ⱼ,
@@ -272,6 +272,9 @@ Adapt.adapt_structure(to, bc::ImmersedBoundaryCondition) = ImmersedBoundaryCondi
                                                                                      Adapt.adapt(to, bc.north),
                                                                                      Adapt.adapt(to, bc.bottom),
                                                                                      Adapt.adapt(to, bc.top))
+
+update_boundary_condition!(bc::ImmersedBoundaryCondition, args...) = nothing
+
 #####
 ##### Alternative implementation for immersed flux divergence
 #####
diff --git a/src/ImmersedBoundaries/mask_immersed_field.jl b/src/ImmersedBoundaries/mask_immersed_field.jl
index 4f8f0afbad..9ae23d88c3 100644
--- a/src/ImmersedBoundaries/mask_immersed_field.jl
+++ b/src/ImmersedBoundaries/mask_immersed_field.jl
@@ -1,5 +1,6 @@
 using KernelAbstractions: @kernel, @index
 using Statistics
+using Oceananigans.AbstractOperations: BinaryOperation
 using Oceananigans.Fields: location, ZReducedField, Field
 
 instantiate(T::Type) = T()
@@ -9,6 +10,26 @@ mask_immersed_field!(field, grid, loc, value) = nothing
 mask_immersed_field!(field::Field, value=zero(eltype(field.grid))) =
     mask_immersed_field!(field, field.grid, location(field), value)
 
+mask_immersed_field!(::Number, args...) = nothing
+
+function mask_immersed_field!(bop::BinaryOperation{<:Any, <:Any, <:Any, typeof(+)}, value=zero(eltype(bop)))
+    a_value = ifelse(bop.b isa Number, -bop.b, value) # mask `a` with `-b` when `b` is a Number, else with `value`
+    mask_immersed_field!(bop.a, a_value)
+
+    b_value = ifelse(bop.a isa Number, -bop.a, value) # mask `b` with `-a` when `a` is a Number, else with `value`
+    mask_immersed_field!(bop.b, b_value)
+    return nothing
+end
+
+function mask_immersed_field!(bop::BinaryOperation{<:Any, <:Any, <:Any, typeof(-)}, value=zero(eltype(bop)))
+    a_value = ifelse(bop.b isa Number, bop.b, value) # mask `a` with `b` when `b` is a Number, else with `value`
+    mask_immersed_field!(bop.a, a_value)
+
+    b_value = ifelse(bop.a isa Number, bop.a, value) # mask `b` with `a` when `a` is a Number, else with `value`
+    mask_immersed_field!(bop.b, b_value)
+    return nothing
+end
+
 """
     mask_immersed_field!(field::Field, grid::ImmersedBoundaryGrid, loc, value)
 
@@ -21,7 +42,6 @@ function mask_immersed_field!(field::Field, grid::ImmersedBoundaryGrid, loc, val
     return nothing
 end
 
-
 @kernel function _mask_immersed_field!(field, loc, grid, value)
     i, j, k = @index(Global, NTuple)
     @inbounds field[i, j, k] = scalar_mask(i, j, k, grid, grid.immersed_boundary, loc..., value, field)
@@ -32,6 +52,26 @@ mask_immersed_field_xy!(::Nothing, args...; kw...) = nothing
 mask_immersed_field_xy!(field, value=zero(eltype(field.grid)); k, mask = peripheral_node) =
     mask_immersed_field_xy!(field, field.grid, location(field), value; k, mask)
 
+mask_immersed_field_xy!(::Number, args...; kw...) = nothing # nothing to mask for a Number; keyword arguments are accepted and ignored
+
+function mask_immersed_field_xy!(bop::BinaryOperation{<:Any, <:Any, <:Any, typeof(+)}, value=zero(eltype(bop)); kw...)
+    a_value = ifelse(bop.b isa Number, -bop.b, value) # mask `a` with `-b` when `b` is a Number, else with `value`
+    mask_immersed_field_xy!(bop.a, a_value; kw...)    # forward keywords (e.g. `k`, `mask`) to the operand's own method
+
+    b_value = ifelse(bop.a isa Number, -bop.a, value) # mask `b` with `-a` when `a` is a Number, else with `value`
+    mask_immersed_field_xy!(bop.b, b_value; kw...)
+    return nothing
+end
+
+function mask_immersed_field_xy!(bop::BinaryOperation{<:Any, <:Any, <:Any, typeof(-)}, value=zero(eltype(bop)); kw...)
+    a_value = ifelse(bop.b isa Number, bop.b, value) # mask `a` with `b` when `b` is a Number, else with `value`
+    mask_immersed_field_xy!(bop.a, a_value; kw...)   # forward keywords (e.g. `k`, `mask`) to the operand's own method
+
+    b_value = ifelse(bop.a isa Number, bop.a, value) # mask `b` with `a` when `a` is a Number, else with `value`
+    mask_immersed_field_xy!(bop.b, b_value; kw...)
+    return nothing
+end
+
 """
     mask_immersed_field_xy!(field::Field, grid::ImmersedBoundaryGrid, loc, value; k, mask=peripheral_node)
 
diff --git a/src/ImmersedBoundaries/partial_cell_bottom.jl b/src/ImmersedBoundaries/partial_cell_bottom.jl
index cce15e6e06..d08f6db815 100644
--- a/src/ImmersedBoundaries/partial_cell_bottom.jl
+++ b/src/ImmersedBoundaries/partial_cell_bottom.jl
@@ -11,18 +11,18 @@ struct PartialCellBottom{H, E} <: AbstractGridFittedBottom{H}
     minimum_fractional_cell_height :: E
 end
 
-const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PartialCellBottom}
+const PCBIBG{FT, TX, TY, TZ} = ImmersedBoundaryGrid{FT, TX, TY, TZ, <:Any, <:PartialCellBottom} where {FT, TX, TY, TZ}
 
 function Base.summary(ib::PartialCellBottom)
-    hmax = maximum(parent(ib.bottom_height))
-    hmin = minimum(parent(ib.bottom_height))
-    hmean = mean(parent(ib.bottom_height))
+    zmax = maximum(parent(ib.bottom_height))
+    zmin = minimum(parent(ib.bottom_height))
+    zmean = mean(parent(ib.bottom_height))
 
     summary1 = "PartialCellBottom("
 
-    summary2 = string("mean(z)=", prettysummary(hmean),
-                      ", min(z)=", prettysummary(hmin),
-                      ", max(z)=", prettysummary(hmax),
+    summary2 = string("mean(zb)=", prettysummary(zmean),
+                      ", min(zb)=", prettysummary(zmin),
+                      ", max(zb)=", prettysummary(zmax),
                       ", ϵ=", prettysummary(ib.minimum_fractional_cell_height))
 
     summary3 = ")"
@@ -63,6 +63,7 @@ end
 function ImmersedBoundaryGrid(grid, ib::PartialCellBottom)
     bottom_field = Field{Center, Center, Nothing}(grid)
     set!(bottom_field, ib.bottom_height)
+    @apply_regionally clamp_bottom_height!(bottom_field, grid)
     fill_halo_regions!(bottom_field)
     new_ib = PartialCellBottom(bottom_field, ib.minimum_fractional_cell_height)
     TX, TY, TZ = topology(grid)
@@ -77,32 +78,49 @@ function on_architecture(arch, ib::PartialCellBottom{<:Field})
     return PartialCellBottom(new_bottom_height, ib.minimum_fractional_cell_height)
 end
 
-Adapt.adapt_structure(to, ib::PartialCellBottom) = PartialCellBottom(adapt(to, ib.bottom_height.data),
+Adapt.adapt_structure(to, ib::PartialCellBottom) = PartialCellBottom(adapt(to, ib.bottom_height),
                                                                      ib.minimum_fractional_cell_height)     
 
-on_architecture(to, ib::PartialCellBottom) = PartialCellBottom(on_architecture(to, ib.bottom_height.data),
+on_architecture(to, ib::PartialCellBottom) = PartialCellBottom(on_architecture(to, ib.bottom_height),
                                                                on_architecture(to, ib.minimum_fractional_cell_height))     
 
 """
-
-        --x--
-          ∘   k+1
-    k+1 --x--    ↑      <- node z
-          ∘   k  | Δz
-      k --x--    ↓
+    immersed     underlying
+
+      --x--        --x--
+            
+            
+        ∘   ↑        ∘   k+1
+            |
+            |               
+  k+1 --x-- |  k+1 --x--    ↑      <- node z
+        ∘   ↓               |
+   zb ⋅⋅x⋅⋅                 |
+                            |
+                     ∘   k  | Δz
+                            |
+                            |
+                 k --x--    ↓
       
-Criterion is h ≥ z - ϵ Δz
+Criterion is zb ≥ z - ϵ Δz
 
 """
 @inline function _immersed_cell(i, j, k, underlying_grid, ib::PartialCellBottom)
-    # Face node above current cell
-    z = znode(i, j, k+1, underlying_grid, c, c, f)
-    h = @inbounds ib.bottom_height[i, j, 1]
-    return z ≤ h
+    # Face node below current cell
+    z  = znode(i, j, k, underlying_grid, c, c, f)
+    zb = @inbounds ib.bottom_height[i, j, 1]
+    ϵ  = ib.minimum_fractional_cell_height
+    # z + Δz is equal to the face above the current cell
+    Δz = Δzᶜᶜᶜ(i, j, k, underlying_grid)
+    return (z + Δz * (1 - ϵ)) ≤ zb
 end
 
-@inline bottom_cell(i, j, k, ibg::PCBIBG) = !immersed_cell(i, j, k,   ibg.underlying_grid, ibg.immersed_boundary) &
-                                             immersed_cell(i, j, k-1, ibg.underlying_grid, ibg.immersed_boundary)
+@inline function bottom_cell(i, j, k, ibg::PCBIBG)
+    grid = ibg.underlying_grid
+    ib = ibg.immersed_boundary
+    # This one's not immersed, but the next one down is
+    return !immersed_cell(i, j, k, grid, ib) & immersed_cell(i, j, k-1, grid, ib)
+end
 
 @inline function Δzᶜᶜᶜ(i, j, k, ibg::PCBIBG)
     underlying_grid = ibg.underlying_grid
@@ -145,4 +163,18 @@ end
 @inline Δzᶜᶠᶠ(i, j, k, ibg::PCBIBG) = min(Δzᶜᶜᶠ(i, j-1, k, ibg), Δzᶜᶜᶠ(i, j, k, ibg))      
 @inline Δzᶠᶠᶠ(i, j, k, ibg::PCBIBG) = min(Δzᶠᶜᶠ(i, j-1, k, ibg), Δzᶠᶜᶠ(i, j, k, ibg))
 
+# Make sure Δz works for horizontally-Flat topologies.
+# (There's no point in using z-Flat with PartialCellBottom).
+const XFlatPCBIBG = ImmersedBoundaryGrid{<:Any, <:Flat, <:Any, <:Any, <:Any, <:PartialCellBottom}
+const YFlatPCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Flat, <:Any, <:Any, <:PartialCellBottom}
+
+@inline Δzᶠᶜᶜ(i, j, k, ibg::XFlatPCBIBG) = Δzᶜᶜᶜ(i, j, k, ibg)
+@inline Δzᶠᶜᶠ(i, j, k, ibg::XFlatPCBIBG) = Δzᶜᶜᶠ(i, j, k, ibg)
+@inline Δzᶜᶠᶜ(i, j, k, ibg::YFlatPCBIBG) = Δzᶜᶜᶜ(i, j, k, ibg)
+
+@inline Δzᶜᶠᶠ(i, j, k, ibg::YFlatPCBIBG) = Δzᶜᶜᶠ(i, j, k, ibg)
+@inline Δzᶠᶠᶜ(i, j, k, ibg::XFlatPCBIBG) = Δzᶜᶠᶜ(i, j, k, ibg)
+@inline Δzᶠᶠᶜ(i, j, k, ibg::YFlatPCBIBG) = Δzᶠᶜᶜ(i, j, k, ibg)
+
 @inline z_bottom(i, j, ibg::PCBIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j, 1]
+
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
index ca7214e82c..375ab00883 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
@@ -2,7 +2,7 @@ import Oceananigans.Models: compute_boundary_tendencies!
 import Oceananigans.Models: compute_boundary_tendencies!
 
 using Oceananigans.Grids: halo_size
-using Oceananigans.ImmersedBoundaries: active_interior_map, DistributedActiveCellsIBG
+using Oceananigans.ImmersedBoundaries: retrieve_interior_active_cells_map, DistributedActiveCellsIBG
 using Oceananigans.Models.NonhydrostaticModels: boundary_tendency_kernel_parameters,
                                                 boundary_p_kernel_parameters, 
                                                 boundary_κ_kernel_parameters,
@@ -38,13 +38,15 @@ function compute_boundary_tendency_contributions!(grid::DistributedActiveCellsIB
     maps = grid.interior_active_cells
     
     for (name, map) in zip(keys(maps), maps)
-        compute_boundary = (name != :interior) && !isnothing(map) 
         
-        # If there exists a boundary map, then we compute the boundary contributions
+        # If there exists a boundary map, then we compute the boundary contributions. If not, the 
+        # boundary contributions have already been calculated. We exclude the interior because it has
+        # already been calculated
+        compute_boundary = (name != :interior) && !isnothing(map) 
+
         if compute_boundary
-            active_boundary_map = active_interior_map(Val(name))
-            compute_hydrostatic_free_surface_tendency_contributions!(model, tuple(:xyz); 
-                                                                     active_cells_map = active_boundary_map)
+            active_cells_map = retrieve_interior_active_cells_map(grid, Val(name))
+            compute_hydrostatic_free_surface_tendency_contributions!(model, tuple(:xyz); active_cells_map)
         end
     end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
index 0c9f6b17b5..db53b860d6 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
@@ -8,10 +8,10 @@ using Oceananigans.Utils: work_layout, KernelParameters
 using Oceananigans.Grids: halo_size
 using Oceananigans.Fields: immersed_boundary_condition
 using Oceananigans.Biogeochemistry: update_tendencies!
-using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: FlavorOfCATKE
+using Oceananigans.TurbulenceClosures.TKEBasedVerticalDiffusivities: FlavorOfCATKE, FlavorOfTD
 
-using Oceananigans.ImmersedBoundaries: active_interior_map, ActiveCellsIBG, 
-                                       InteriorMap, active_linear_index_to_tuple
+using Oceananigans.ImmersedBoundaries: retrieve_interior_active_cells_map, ActiveCellsIBG, 
+                                       active_linear_index_to_tuple
 
 """
     compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
@@ -24,9 +24,10 @@ function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     kernel_parameters = tuple(interior_tendency_kernel_parameters(model.grid))
 
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
-    # interior of the domain
-    compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters;
-                                                             active_cells_map = active_interior_map(model.grid))
+    # interior of the domain. The active cells map restricts the computation to the active cells in the
+    # interior if the grid is _immersed_ and the `active_cells_map` kwarg is active
+    active_cells_map = retrieve_interior_active_cells_map(model.grid, Val(:interior))
+    compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; active_cells_map)
 
     complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
 
@@ -201,26 +202,26 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, map, args)
+@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, ::Nothing, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid::ActiveCellsIBG, map, args)
+@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, active_cells_map, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, map, grid)
+    i, j, k = active_linear_index_to_tuple(idx, active_cells_map)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, map, args)
+@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, ::Nothing, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid::ActiveCellsIBG, map, args)
+@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, active_cells_map, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, map, grid)
+    i, j, k = active_linear_index_to_tuple(idx, active_cells_map)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -229,14 +230,14 @@ end
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, map, args)
+@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, ::Nothing, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid::ActiveCellsIBG, map, args)
+@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, active_cells_map, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, map, grid)
+    i, j, k = active_linear_index_to_tuple(idx, active_cells_map)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index a3c590b840..927489cf50 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -38,11 +38,8 @@ end
     Hx, Hy, _ = halo_size(grid)
     Tx, Ty, _ = topology(grid)
 
-    Sx = Tx == Flat ? Nx : Nx + 2Hx - 2
-    Sy = Ty == Flat ? Ny : Ny + 2Hy - 2
+    ii = ifelse(Tx == Flat, 1:Nx, -Hx+2:Nx+Hx-1)
+    jj = ifelse(Ty == Flat, 1:Ny, -Hy+2:Ny+Hy-1)
 
-    Ox = Tx == Flat ? 0 : - Hx + 1
-    Oy = Ty == Flat ? 0 : - Hy + 1
-
-    return KernelParameters((Sx, Sy), (Ox, Oy))
+    return KernelParameters(ii, jj)
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
index 65ee037f2e..c76ac3b525 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
@@ -1,7 +1,7 @@
 using Oceananigans.Fields: location
 using Oceananigans.TimeSteppers: ab2_step_field!
 using Oceananigans.TurbulenceClosures: implicit_step!
-using Oceananigans.ImmersedBoundaries: active_interior_map, active_surface_map
+using Oceananigans.ImmersedBoundaries: retrieve_interior_active_cells_map, retrieve_surface_active_cells_map
 
 import Oceananigans.TimeSteppers: ab2_step!
 
@@ -78,6 +78,10 @@ function ab2_step_tracers!(tracers, model, Δt, χ)
         # TODO: do better than this silly criteria, also need to check closure tuples
         if closure isa FlavorOfCATKE && tracer_name == :e
             @debug "Skipping AB2 step for e"
+        elseif closure isa FlavorOfTD && tracer_name == :ϵ
+            @debug "Skipping AB2 step for ϵ"
+        elseif closure isa FlavorOfTD && tracer_name == :e
+            @debug "Skipping AB2 step for e"
         else
             Gⁿ = model.timestepper.Gⁿ[tracer_name]
             G⁻ = model.timestepper.G⁻[tracer_name]
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index 7e84c424e4..7bd3ee6337 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -106,10 +106,10 @@ function HydrostaticFreeSurfaceModel(; grid,
                                              clock = Clock{eltype(grid)}(time = 0),
                                 momentum_advection = CenteredSecondOrder(),
                                   tracer_advection = CenteredSecondOrder(),
-                                          buoyancy = SeawaterBuoyancy(eltype(grid)),
+                                          buoyancy = nothing,
                                           coriolis = nothing,
                                       free_surface = default_free_surface(grid, gravitational_acceleration=g_Earth),
-                                           tracers = (:T, :S),
+                                           tracers = nothing,
                                       forcing::NamedTuple = NamedTuple(),
                                            closure = nothing,
                    boundary_conditions::NamedTuple = NamedTuple(),
@@ -201,7 +201,7 @@ function HydrostaticFreeSurfaceModel(; grid,
                                         free_surface, forcing, closure, particles, biogeochemistry, velocities, tracers,
                                         pressure, diffusivity_fields, timestepper, auxiliary_fields)
 
-    update_state!(model)
+    update_state!(model; compute_tendencies = false)
 
     return model
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl
index ca21bfab5e..e54ad5c49c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl
@@ -54,7 +54,7 @@ where
 𝐮_⋆ = 𝐮^n + \\int_{t_n}^{t_{n+1}} 𝐆ᵤ \\, 𝖽t .
 ```
 
-This equation can be solved, in general, using the [`PreconditionedConjugateGradientSolver`](@ref) but 
+This equation can be solved, in general, using the [`ConjugateGradientSolver`](@ref) but 
 other solvers can be invoked in special cases.
 
 If ``H`` is constant, we divide through out to obtain
@@ -69,7 +69,7 @@ surface can be obtained using the [`FFTBasedPoissonSolver`](@ref).
 `solver_method` can be either of:
 * `:FastFourierTransform` for [`FFTBasedPoissonSolver`](@ref)
 * `:HeptadiagonalIterativeSolver`  for [`HeptadiagonalIterativeSolver`](@ref)
-* `:PreconditionedConjugateGradient` for [`PreconditionedConjugateGradientSolver`](@ref)
+* `:PreconditionedConjugateGradient` for [`ConjugateGradientSolver`](@ref)
 
 By default, if the grid has regular spacing in the horizontal directions then the `:FastFourierTransform` is chosen,
 otherwise the `:HeptadiagonalIterativeSolver`.
@@ -134,7 +134,7 @@ function implicit_free_surface_step!(free_surface::ImplicitFreeSurface, model, 
     solver = free_surface.implicit_step_solver
     arch   = model.architecture
 
-    fill_halo_regions!(model.velocities)
+    fill_halo_regions!(model.velocities, model.clock, fields(model))
 
     # Compute right hand side of implicit free surface equation
     @apply_regionally local_compute_integrated_volume_flux!(∫ᶻQ, model.velocities, arch)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/pcg_implicit_free_surface_solver.jl b/src/Models/HydrostaticFreeSurfaceModels/pcg_implicit_free_surface_solver.jl
index 22955842c1..8db6040778 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/pcg_implicit_free_surface_solver.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/pcg_implicit_free_surface_solver.jl
@@ -70,7 +70,7 @@ function PCGImplicitFreeSurfaceSolver(grid::AbstractGrid, settings, gravitationa
     # TODO: reuse solver.storage for rhs when preconditioner isa FFTImplicitFreeSurfaceSolver?
     right_hand_side = ZFaceField(grid, indices = (:, :, size(grid, 3) + 1))
 
-    solver = PreconditionedConjugateGradientSolver(implicit_free_surface_linear_operation!;
+    solver = ConjugateGradientSolver(implicit_free_surface_linear_operation!;
                                                    template_field = right_hand_side,
                                                    settings...)
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index ce9accac70..b24eb51a06 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -54,7 +54,7 @@ function PrescribedVelocityFields(; u = ZeroField(),
 end
 
 wrap_prescribed_field(X, Y, Z, f::Function, grid; kwargs...) = FunctionField{X, Y, Z}(f, grid; kwargs...)
-wrap_prescribed_field(X, Y, Z, f, grid; kwargs...) = f
+wrap_prescribed_field(X, Y, Z, f, grid; kwargs...) = field((X, Y, Z), f, grid)
 
 function HydrostaticFreeSurfaceVelocityFields(velocities::PrescribedVelocityFields, grid, clock, bcs)
 
@@ -121,8 +121,8 @@ const OnlyParticleTrackingModel = HydrostaticFreeSurfaceModel{TS, E, A, S, G, T,
                  {TS, E, A, S, G, T, V, B, R, F, P<:AbstractLagrangianParticles, U<:PrescribedVelocityFields, C<:NamedTuple{(), Tuple{}}}
 
 function time_step!(model::OnlyParticleTrackingModel, Δt; callbacks = [], kwargs...)
-    model.timestepper.previous_Δt = Δt
     tick!(model.clock, Δt)
+    model.clock.last_Δt = Δt
     step_lagrangian_particles!(model, Δt)
     update_state!(model, callbacks)
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/set_hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/set_hydrostatic_free_surface_model.jl
index 915d0989d8..f216d312c7 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/set_hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/set_hydrostatic_free_surface_model.jl
@@ -18,8 +18,8 @@ Example
 
 ```jldoctest
 using Oceananigans
-
-model = HydrostaticFreeSurfaceModel(grid = RectilinearGrid(size=(16, 16, 16), extent=(1, 1, 1)))
+grid = RectilinearGrid(size=(16, 16, 16), extent=(1, 1, 1))
+model = HydrostaticFreeSurfaceModel(; grid, tracers=:T)
 
 # Set u to a parabolic function of z, v to random numbers damped
 # at top and bottom, and T to some silly array of half zeros,
@@ -61,7 +61,7 @@ model.velocities.u
     end
 
     initialize!(model)
-    update_state!(model)
+    update_state!(model; compute_tendencies = false)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
index ea5f343720..d4d3dec8d8 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
@@ -6,7 +6,7 @@ using Oceananigans.Grids: Flat, Bounded
 using Oceananigans.Fields: ZeroField
 using Oceananigans.Coriolis: AbstractRotation
 using Oceananigans.TurbulenceClosures: AbstractTurbulenceClosure
-using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: CATKEVDArray
+using Oceananigans.TurbulenceClosures.TKEBasedVerticalDiffusivities: CATKEVDArray
 
 import Oceananigans.Grids: validate_size, validate_halo
 import Oceananigans.Models: validate_tracer_advection
@@ -61,7 +61,7 @@ compute_free_surface_tendency!(::SingleColumnGrid, ::SplitExplicitFreeSurfaceHFS
 
 # Fast state update and halo filling
 
-function update_state!(model::HydrostaticFreeSurfaceModel, grid::SingleColumnGrid, callbacks)
+function update_state!(model::HydrostaticFreeSurfaceModel, grid::SingleColumnGrid, callbacks; compute_tendencies = true)
 
     fill_halo_regions!(prognostic_fields(model), model.clock, fields(model))
 
@@ -78,6 +78,9 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid::SingleColumnGri
     end
 
     update_biogeochemical_state!(model.biogeochemistry, model)
+    
+    compute_tendencies && 
+        @apply_regionally compute_tendencies!(model, callbacks)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/slice_ensemble_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/slice_ensemble_model_mode.jl
index d9fda68ef1..e2963471b3 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/slice_ensemble_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/slice_ensemble_model_mode.jl
@@ -1,11 +1,11 @@
 using Oceananigans.Grids: Flat, Bounded, y_domain
 using Oceananigans.TurbulenceClosures: AbstractTurbulenceClosure
-using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: _top_tke_flux, CATKEVDArray
+using Oceananigans.TurbulenceClosures.TKEBasedVerticalDiffusivities: _top_tke_flux, CATKEVDArray
 
 import Oceananigans.Grids: validate_size, validate_halo, XYRegularRG
 import Oceananigans.TurbulenceClosures: time_discretization, compute_diffusivities!, with_tracers
 import Oceananigans.TurbulenceClosures: ∂ⱼ_τ₁ⱼ, ∂ⱼ_τ₂ⱼ, ∂ⱼ_τ₃ⱼ, ∇_dot_qᶜ
-import Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: top_tke_flux
+import Oceananigans.TurbulenceClosures.TKEBasedVerticalDiffusivities: top_tke_flux
 import Oceananigans.Coriolis: x_f_cross_U, y_f_cross_U, z_f_cross_U
 
 #####
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index aee8c96190..4fc0d4cf92 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -5,8 +5,8 @@ using Oceananigans.BoundaryConditions
 using Oceananigans.Operators
 using Oceananigans.Architectures: convert_args
 using Oceananigans.ImmersedBoundaries: peripheral_node, immersed_inactive_node, GFBIBG
-using Oceananigans.ImmersedBoundaries: inactive_node, IBG, c, f, ZColumnMap
-using Oceananigans.ImmersedBoundaries: mask_immersed_field!, active_surface_map, active_interior_map
+using Oceananigans.ImmersedBoundaries: inactive_node, IBG, c, f
+using Oceananigans.ImmersedBoundaries: mask_immersed_field!, retrieve_surface_active_cells_map, retrieve_interior_active_cells_map
 using Oceananigans.ImmersedBoundaries: active_linear_index_to_tuple, ActiveCellsIBG, ActiveZColumnsIBG
 using Oceananigans.DistributedComputations: child_architecture
 using Oceananigans.DistributedComputations: Distributed
@@ -42,23 +42,26 @@ const μ = 1 - δ - γ - ϵ
 #   `δyᵃᶜᵃ_V` : Hardcodes NoPenetration or Periodic boundary conditions for the meridional barotropic velocity V in y direction
 #
 # The functions `η★` `U★` and `V★` represent the value of free surface, barotropic zonal and meridional velocity at time step m+1/2
-@inline δxᶠᵃᵃ_η(i, j, k, grid, T, η★::Function, args...) = δxᶠᶜᶠ(i, j, k, grid, η★, args...)
-@inline δyᵃᶠᵃ_η(i, j, k, grid, T, η★::Function, args...) = δyᶜᶠᶠ(i, j, k, grid, η★, args...)
+@inline δxᶠᵃᵃ_η(i, j, k, grid, T, η★::Function, args...) = δxᶠᵃᵃ(i, j, k, grid, η★, args...)
+@inline δyᵃᶠᵃ_η(i, j, k, grid, T, η★::Function, args...) = δyᵃᶠᵃ(i, j, k, grid, η★, args...)
 @inline δxᶜᵃᵃ_U(i, j, k, grid, T, U★::Function, args...) = δxᶜᵃᵃ(i, j, k, grid, U★, args...)
 @inline δyᵃᶜᵃ_V(i, j, k, grid, T, V★::Function, args...) = δyᵃᶜᵃ(i, j, k, grid, V★, args...)
 
-@inline δxᶠᵃᵃ_η(i, j, k, grid, ::Type{Periodic}, η★::Function, args...) = ifelse(i == 1, η★(1, j, k, grid, args...) - η★(grid.Nx, j, k, grid, args...), δxᶠᶜᶠ(i, j, k, grid, η★, args...))
-@inline δyᵃᶠᵃ_η(i, j, k, grid, ::Type{Periodic}, η★::Function, args...) = ifelse(j == 1, η★(i, 1, k, grid, args...) - η★(i, grid.Ny, k, grid, args...), δyᶜᶠᶠ(i, j, k, grid, η★, args...))
+@inline δxᶠᵃᵃ_η(i, j, k, grid, ::Type{Periodic}, η★::Function, args...) = ifelse(i == 1, η★(1, j, k, grid, args...) - η★(grid.Nx, j, k, grid, args...), δxᶠᵃᵃ(i, j, k, grid, η★, args...))
+@inline δyᵃᶠᵃ_η(i, j, k, grid, ::Type{Periodic}, η★::Function, args...) = ifelse(j == 1, η★(i, 1, k, grid, args...) - η★(i, grid.Ny, k, grid, args...), δyᵃᶠᵃ(i, j, k, grid, η★, args...))
 
 @inline δxᶜᵃᵃ_U(i, j, k, grid, ::Type{Periodic}, U★::Function, args...) = ifelse(i == grid.Nx, U★(1, j, k, grid, args...) - U★(grid.Nx, j, k, grid, args...), δxᶜᵃᵃ(i, j, k, grid, U★, args...))
 @inline δyᵃᶜᵃ_V(i, j, k, grid, ::Type{Periodic}, V★::Function, args...) = ifelse(j == grid.Ny, V★(i, 1, k, grid, args...) - V★(i, grid.Ny, k, grid, args...), δyᵃᶜᵃ(i, j, k, grid, V★, args...))
 
+@inline δxᶠᵃᵃ_η(i, j, k, grid, ::Type{FullyConnected}, η★::Function, args...) = δxᶠᶜᶠ(i, j, k, grid, η★, args...)
+@inline δyᵃᶠᵃ_η(i, j, k, grid, ::Type{FullyConnected}, η★::Function, args...) = δyᶜᶠᶠ(i, j, k, grid, η★, args...)
+
 # Enforce NoFlux conditions for `η★`
 
-@inline δxᶠᵃᵃ_η(i, j, k, grid, ::Type{Bounded},        η★::Function, args...) = ifelse(i == 1, zero(grid), δxᶠᶜᶠ(i, j, k, grid, η★, args...))
-@inline δyᵃᶠᵃ_η(i, j, k, grid, ::Type{Bounded},        η★::Function, args...) = ifelse(j == 1, zero(grid), δyᶜᶠᶠ(i, j, k, grid, η★, args...))
-@inline δxᶠᵃᵃ_η(i, j, k, grid, ::Type{RightConnected}, η★::Function, args...) = ifelse(i == 1, zero(grid), δxᶠᶜᶠ(i, j, k, grid, η★, args...))
-@inline δyᵃᶠᵃ_η(i, j, k, grid, ::Type{RightConnected}, η★::Function, args...) = ifelse(j == 1, zero(grid), δyᶜᶠᶠ(i, j, k, grid, η★, args...))
+@inline δxᶠᵃᵃ_η(i, j, k, grid, ::Type{Bounded},        η★::Function, args...) = ifelse(i == 1, zero(grid), δxᶠᵃᵃ(i, j, k, grid, η★, args...))
+@inline δyᵃᶠᵃ_η(i, j, k, grid, ::Type{Bounded},        η★::Function, args...) = ifelse(j == 1, zero(grid), δyᵃᶠᵃ(i, j, k, grid, η★, args...))
+@inline δxᶠᵃᵃ_η(i, j, k, grid, ::Type{RightConnected}, η★::Function, args...) = ifelse(i == 1, zero(grid), δxᶠᵃᵃ(i, j, k, grid, η★, args...))
+@inline δyᵃᶠᵃ_η(i, j, k, grid, ::Type{RightConnected}, η★::Function, args...) = ifelse(j == 1, zero(grid), δyᵃᶠᵃ(i, j, k, grid, η★, args...))
 
 # Enforce Impenetrability conditions for `U★` and `V★`
 
@@ -195,7 +198,7 @@ end
 
 # Barotropic Model Kernels
 # u_Δz = u * Δz
-@kernel function _barotropic_mode_kernel!(U, V, grid, u, v)
+@kernel function _barotropic_mode_kernel!(U, V, grid, ::Nothing, u, v)
     i, j  = @index(Global, NTuple)	
     k_top = grid.Nz+1
 
@@ -210,9 +213,9 @@ end
 
 # Barotropic Model Kernels
 # u_Δz = u * Δz
-@kernel function _barotropic_mode_kernel!(U, V, grid::ActiveZColumnsIBG, u, v)
+@kernel function _barotropic_mode_kernel!(U, V, grid, active_cells_map, u, v)
     idx = @index(Global, Linear)
-    i, j = active_linear_index_to_tuple(idx, ZColumnMap(), grid)
+    i, j = active_linear_index_to_tuple(idx, active_cells_map)
     k_top = grid.Nz+1
 
     @inbounds U[i, j, k_top-1] = Δzᶠᶜᶜ(i, j, 1, grid) * u[i, j, 1]
@@ -224,9 +227,13 @@ end
     end
 end
 
-compute_barotropic_mode!(U, V, grid, u, v) = 
-    launch!(architecture(grid), grid, :xy, _barotropic_mode_kernel!, U, V, grid, u, v; 
-            active_cells_map = active_surface_map(grid))
+@inline function compute_barotropic_mode!(U, V, grid, u, v) 
+    active_cells_map = retrieve_surface_active_cells_map(grid)
+
+    launch!(architecture(grid), grid, :xy, _barotropic_mode_kernel!, U, V, grid, active_cells_map, u, v; active_cells_map)
+
+    return nothing
+end
 
 function initialize_free_surface_state!(state, η, timestepper)
 
@@ -410,7 +417,7 @@ function iterate_split_explicit!(free_surface, grid, Δτᴮ, weights, ::Val{Nsu
 end
 
 # Calculate RHS for the barotropic time step.
-@kernel function _compute_integrated_ab2_tendencies!(Gᵁ, Gⱽ, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
+@kernel function _compute_integrated_ab2_tendencies!(Gᵁ, Gⱽ, grid, ::Nothing, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
     i, j  = @index(Global, NTuple)
     k_top = grid.Nz + 1
 
@@ -424,9 +431,9 @@ end
 end
 
 # Calculate RHS for the barotropic time step.q
-@kernel function _compute_integrated_ab2_tendencies!(Gᵁ, Gⱽ, grid::ActiveZColumnsIBG, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
+@kernel function _compute_integrated_ab2_tendencies!(Gᵁ, Gⱽ, grid, active_cells_map, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
     idx = @index(Global, Linear)
-    i, j = active_linear_index_to_tuple(idx, ZColumnMap(), grid)
+    i, j = active_linear_index_to_tuple(idx, active_cells_map)
     k_top = grid.Nz+1
 
     @inbounds Gᵁ[i, j, k_top-1] = Δzᶠᶜᶜ(i, j, 1, grid) * ab2_step_Gu(i, j, 1, grid, Gu⁻, Guⁿ, χ)
@@ -464,9 +471,14 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
     return nothing
 end
 
-setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ) =
+@inline function setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ) 
+    active_cells_map = retrieve_surface_active_cells_map(grid)
+
     launch!(architecture(grid), grid, :xy, _compute_integrated_ab2_tendencies!, auxiliary.Gᵁ, auxiliary.Gⱽ, grid, 
-            Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ; active_cells_map = active_surface_map(grid))
+            active_cells_map, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ; active_cells_map)
+
+    return nothing
+end
             
 wait_free_surface_communication!(free_surface, arch) = nothing
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
index 85f83bf932..a84d03e13f 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
@@ -4,7 +4,7 @@ using Oceananigans.TimeSteppers: store_field_tendencies!
 
 using Oceananigans: prognostic_fields
 using Oceananigans.Grids: AbstractGrid
-using Oceananigans.ImmersedBoundaries: active_interior_map
+using Oceananigans.ImmersedBoundaries: retrieve_interior_active_cells_map
 
 using Oceananigans.Utils: launch!
 
@@ -37,6 +37,10 @@ function store_tendencies!(model::HydrostaticFreeSurfaceModel)
 
         if closure isa FlavorOfCATKE && field_name == :e
             @debug "Skipping store tendencies for e"
+        elseif closure isa FlavorOfTD && field_name == :ϵ
+            @debug "Skipping store tendencies for ϵ"
+        elseif closure isa FlavorOfTD && field_name == :e
+            @debug "Skipping store tendencies for e"
         else
             launch!(model.architecture, model.grid, :xyz,
                     store_field_tendencies!,
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index b333b32ae4..b8c2739bf4 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -3,6 +3,7 @@ using Oceananigans.BoundaryConditions
 
 using Oceananigans: UpdateStateCallsite
 using Oceananigans.Biogeochemistry: update_biogeochemical_state!
+using Oceananigans.BoundaryConditions: update_boundary_condition!
 using Oceananigans.TurbulenceClosures: compute_diffusivities!
 using Oceananigans.ImmersedBoundaries: mask_immersed_field!, mask_immersed_field_xy!, inactive_node
 using Oceananigans.Models: update_model_field_time_series!
@@ -17,11 +18,12 @@ compute_auxiliary_fields!(auxiliary_fields) = Tuple(compute!(a) for a in auxilia
 # single column models.
 
 """
-    update_state!(model::HydrostaticFreeSurfaceModel, callbacks=[])
+    update_state!(model::HydrostaticFreeSurfaceModel, callbacks=[]; compute_tendencies = true)
 
 Update peripheral aspects of the model (auxiliary fields, halo regions, diffusivities,
 hydrostatic pressure) to the current model state. If `callbacks` are provided (in an array),
-they are called in the end.
+they are called at the end. Finally, the tendencies for the new time-step are computed if
+`compute_tendencies = true`.
 """
 update_state!(model::HydrostaticFreeSurfaceModel, callbacks=[]; compute_tendencies = true) =
     update_state!(model, model.grid, callbacks; compute_tendencies)
@@ -32,6 +34,9 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
     # Update possible FieldTimeSeries used in the model
     @apply_regionally update_model_field_time_series!(model, model.clock)
 
+    # Update the boundary conditions
+    @apply_regionally update_boundary_condition!(fields(model), model)
+
     if grid isa ConformalCubedSphereGrid
         prognostic_fields_minus_u_v = (; filter(kv -> kv[1] ∉ (:u, :v), pairs(prognostic_fields(model)))...)
         fill_halo_regions!(prognostic_fields_minus_u_v, model.clock, fields(model); async = true)
diff --git a/src/Models/LagrangianParticleTracking/lagrangian_particle_advection.jl b/src/Models/LagrangianParticleTracking/lagrangian_particle_advection.jl
index 443d144c4e..9ab519c23b 100644
--- a/src/Models/LagrangianParticleTracking/lagrangian_particle_advection.jl
+++ b/src/Models/LagrangianParticleTracking/lagrangian_particle_advection.jl
@@ -38,6 +38,13 @@ Do nothing on Flat dimensions.
 const f = Face()
 const c = Center()
 
+"""
+    immersed_boundary_topology(grid_topology)
+
+Return `Flat()` if `grid_topology` is `Flat` and `Bounded()` otherwise: except along `Flat` directions, immersed boundaries are treated as `Bounded` regardless of the underlying grid topology.
+"""
+immersed_boundary_topology(grid_topology) = ifelse(grid_topology == Flat, Flat(), Bounded())
+
 """
     bounce_immersed_particle((x, y, z), grid, restitution, previous_particle_indices)
 
@@ -48,29 +55,45 @@ bouncing the particle off the immersed boundary with a coefficient or `restituti
     X = flattened_node((x, y, z), ibg)
 
     # Determine current particle cell
-    fi, fj, fk = fractional_indices(X, ibg.underlying_grid, (c, c, c))
+    fi, fj, fk = fractional_indices(X, ibg.underlying_grid, c, c, c)
     i, j, k = truncate_fractional_indices(fi, fj, fk)
 
-    if immersed_cell(i, j, k, ibg)
-        # Determine whether particle was _previously_ in a non-immersed cell
-        i⁻, j⁻, k⁻ = previous_particle_indices
+    # Determine whether particle was _previously_ in a non-immersed cell
+    i⁻, j⁻, k⁻ = previous_particle_indices
 
-        if !immersed_cell(i⁻, j⁻, k⁻, ibg)
-            # Left-right bounds of the previous, non-immersed cell
-            xᴿ, yᴿ, zᴿ = node(i⁻+1, j⁻+1, k⁻+1, ibg, f, f, f)
-            xᴸ, yᴸ, zᴸ = node(i⁻,   j⁻,   k⁻,   ibg, f, f, f)
+    # Right bounds of the previous cell
+    xᴿ = xnode(i⁻ + 1, j⁻ + 1, k⁻ + 1, ibg, f, f, f)
+    yᴿ = ynode(i⁻ + 1, j⁻ + 1, k⁻ + 1, ibg, f, f, f)
+    zᴿ = znode(i⁻ + 1, j⁻ + 1, k⁻ + 1, ibg, f, f, f)
 
-            Cʳ = restitution
-            x⁺ = enforce_boundary_conditions(Bounded(), x, xᴸ, xᴿ, Cʳ)
-            y⁺ = enforce_boundary_conditions(Bounded(), y, yᴸ, yᴿ, Cʳ)
-            z⁺ = enforce_boundary_conditions(Bounded(), z, zᴸ, zᴿ, Cʳ)
+    # Left bounds of the previous cell
+    xᴸ = xnode(i⁻, j⁻, k⁻, ibg, f, f, f)
+    yᴸ = ynode(i⁻, j⁻, k⁻, ibg, f, f, f)
+    zᴸ = znode(i⁻, j⁻, k⁻, ibg, f, f, f)
 
-        end
-    end
+    Cʳ = restitution
+    tx, ty, tz = map(immersed_boundary_topology, topology(ibg))
+    xb⁺ = enforce_boundary_conditions(tx, x, xᴸ, xᴿ, Cʳ)
+    yb⁺ = enforce_boundary_conditions(ty, y, yᴸ, yᴿ, Cʳ)
+    zb⁺ = enforce_boundary_conditions(tz, z, zᴸ, zᴿ, Cʳ)
 
-    return x⁺, y⁺, z⁺
+    immersed = immersed_cell(i, j, k, ibg)
+    x⁺ = ifelse(immersed, xb⁺, x)
+    y⁺ = ifelse(immersed, yb⁺, y)
+    z⁺ = ifelse(immersed, zb⁺, z)
+
+    return (x⁺, y⁺, z⁺)
 end
 
+"""
+    rightmost_interface_index(topology, N)
+
+Return the index of the rightmost cell interface for a grid with `topology` and `N` cells.
+"""
+rightmost_interface_index(::Bounded, N) = N + 1
+rightmost_interface_index(::Periodic, N) = N + 1
+rightmost_interface_index(::Flat, N) = N
+
 """
     advect_particle((x, y, z), p, restitution, grid, Δt, velocities)
 
@@ -106,9 +129,9 @@ given `velocities`, time-step `Δt, and coefficient of `restitution`.
     Nx, Ny, Nz = size(grid)
 
     # Find index of the "rightmost" cell interface
-    iᴿ = length(f, tx, Nx)
-    jᴿ = length(f, ty, Ny)
-    kᴿ = length(f, tz, Nz)
+    iᴿ = rightmost_interface_index(tx, Nx)
+    jᴿ = rightmost_interface_index(ty, Ny)
+    kᴿ = rightmost_interface_index(tz, Nz)
 
     xᴸ = xnode(1, j, k, grid, f, f, f)
     yᴸ = ynode(i, 1, k, grid, f, f, f)
@@ -125,7 +148,7 @@ given `velocities`, time-step `Δt, and coefficient of `restitution`.
     z⁺ = enforce_boundary_conditions(tz, z⁺, zᴸ, zᴿ, Cʳ)
     if grid isa ImmersedBoundaryGrid
         previous_particle_indices = current_particle_indices # particle has been advected
-        x⁺, y⁺, z⁺ = bounce_immersed_particle((x⁺, y⁺, z⁺), grid, Cʳ, previous_particle_indices)
+        (x⁺, y⁺, z⁺) = bounce_immersed_particle((x⁺, y⁺, z⁺), grid, Cʳ, previous_particle_indices)
     end
 
     return (x⁺, y⁺, z⁺)
@@ -136,11 +159,13 @@ end
 #     * Sphere metric for `LatitudeLongitudeGrid` and geographic coordinates
 @inline x_metric(i, j, grid::RectilinearGrid) = 1
 @inline x_metric(i, j, grid::LatitudeLongitudeGrid{FT}) where FT = @inbounds 1 / (grid.radius * hack_cosd(grid.φᵃᶜᵃ[j])) * FT(360 / 2π)
+@inline x_metric(i, j, grid::ImmersedBoundaryGrid) = x_metric(i, j, grid.underlying_grid)
 
 @inline y_metric(i, j, grid::RectilinearGrid) = 1
 @inline y_metric(i, j, grid::LatitudeLongitudeGrid{FT}) where FT = 1 / grid.radius * FT(360 / 2π)
+@inline y_metric(i, j, grid::ImmersedBoundaryGrid) = y_metric(i, j, grid.underlying_grid)
 
-@kernel function _advect_particles!(particles, restitution, grid::AbstractUnderlyingGrid, Δt, velocities)
+@kernel function _advect_particles!(particles, restitution, grid::AbstractGrid, Δt, velocities)
     p = @index(Global)
 
     @inbounds begin
diff --git a/src/Models/Models.jl b/src/Models/Models.jl
index fa46608b96..9399a5b503 100644
--- a/src/Models/Models.jl
+++ b/src/Models/Models.jl
@@ -21,6 +21,7 @@ using Oceananigans.Utils: Time
 import Oceananigans: initialize!
 import Oceananigans.Architectures: architecture
 import Oceananigans.TimeSteppers: reset!
+import Oceananigans.Solvers: iteration
 
 # A prototype interface for AbstractModel.
 #
diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
index 5fccd29f82..0b20dbffbf 100644
--- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
+++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
@@ -10,31 +10,56 @@ using Oceananigans.Utils
 using Oceananigans.Grids
 using Oceananigans.Solvers
 
-using Oceananigans.DistributedComputations: Distributed, DistributedFFTBasedPoissonSolver, reconstruct_global_grid   
+using Oceananigans.DistributedComputations
+using Oceananigans.DistributedComputations: reconstruct_global_grid, Distributed
+using Oceananigans.DistributedComputations: DistributedFFTBasedPoissonSolver, DistributedFourierTridiagonalPoissonSolver
 using Oceananigans.Grids: XYRegularRG, XZRegularRG, YZRegularRG, XYZRegularRG
 using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid
+using Oceananigans.Solvers: GridWithFFTSolver, GridWithFourierTridiagonalSolver 
 using Oceananigans.Utils: SumOfArrays
 
 import Oceananigans: fields, prognostic_fields
 import Oceananigans.Advection: cell_advection_timescale
 import Oceananigans.TimeSteppers: step_lagrangian_particles!
 
-function PressureSolver(arch::Distributed, local_grid::XYZRegularRG)
+function nonhydrostatic_pressure_solver(::Distributed, local_grid::XYZRegularRG)
     global_grid = reconstruct_global_grid(local_grid)
     return DistributedFFTBasedPoissonSolver(global_grid, local_grid)
 end
 
-PressureSolver(arch, grid::XYZRegularRG) = FFTBasedPoissonSolver(grid)
-PressureSolver(arch, grid::XYRegularRG)  = FourierTridiagonalPoissonSolver(grid)
-PressureSolver(arch, grid::XZRegularRG)  = FourierTridiagonalPoissonSolver(grid)
-PressureSolver(arch, grid::YZRegularRG)  = FourierTridiagonalPoissonSolver(grid)
+function nonhydrostatic_pressure_solver(::Distributed, local_grid::GridWithFourierTridiagonalSolver)
+    global_grid = reconstruct_global_grid(local_grid)
+    return DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid)
+end
+
+nonhydrostatic_pressure_solver(arch, grid::XYZRegularRG) = FFTBasedPoissonSolver(grid)
+nonhydrostatic_pressure_solver(arch, grid::GridWithFourierTridiagonalSolver) =
+    FourierTridiagonalPoissonSolver(grid)
+
+const IBGWithFFTSolver = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:GridWithFFTSolver}
+
+function nonhydrostatic_pressure_solver(arch, ibg::IBGWithFFTSolver)
+    msg = """The FFT-based pressure_solver for NonhydrostaticModels on ImmersedBoundaryGrid
+          is approximate and will probably produce velocity fields that are divergent
+          adjacent to the immersed boundary. An experimental but improved pressure_solver
+          is available which may be used by writing
 
-# *Evil grin*
-PressureSolver(arch, ibg::ImmersedBoundaryGrid) = PressureSolver(arch, ibg.underlying_grid)
+              using Oceananigans.Solvers: ConjugateGradientPoissonSolver
+              pressure_solver = ConjugateGradientPoissonSolver(grid)
 
-# fall back
-PressureSolver(arch, grid) = error("None of the implemented pressure solvers for NonhydrostaticModel \
-                                   currently support more than one stretched direction.")
+          Please report issues to https://github.com/CliMA/Oceananigans.jl/issues.
+          """
+    @warn msg
+
+    return nonhydrostatic_pressure_solver(arch, ibg.underlying_grid)
+end
+
+# fallback
+nonhydrostatic_pressure_solver(arch, grid) =
+    error("None of the implemented pressure solvers for NonhydrostaticModel \
+          are supported on $(summary(grid)).")
+
+nonhydrostatic_pressure_solver(grid) = nonhydrostatic_pressure_solver(architecture(grid), grid)
 
 #####
 ##### NonhydrostaticModel definition
@@ -85,3 +110,4 @@ include("compute_nonhydrostatic_tendencies.jl")
 include("compute_nonhydrostatic_boundary_tendencies.jl")
 
 end # module
+
diff --git a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl
index 112183d9a6..933f732d8d 100644
--- a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl
@@ -3,6 +3,9 @@ import Oceananigans.Models: compute_boundary_tendencies!
 using Oceananigans.TurbulenceClosures: required_halo_size
 using Oceananigans.Grids: XFlatGrid, YFlatGrid
 
+# TODO: the code in this file is difficult to understand.
+# Rewriting it may be helpful.
+
 # We assume here that top/bottom BC are always synched (no partitioning in z)
 function compute_boundary_tendencies!(model::NonhydrostaticModel)
     grid = model.grid
diff --git a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
index bf840e07b6..fb7e89ee64 100644
--- a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
@@ -3,8 +3,8 @@ using Oceananigans: fields, TendencyCallsite
 using Oceananigans.Utils: work_layout
 using Oceananigans.Models: complete_communication_and_compute_boundary!, interior_tendency_kernel_parameters
 
-using Oceananigans.ImmersedBoundaries: active_interior_map, ActiveCellsIBG, 
-                                       InteriorMap, active_linear_index_to_tuple
+using Oceananigans.ImmersedBoundaries: retrieve_interior_active_cells_map, ActiveCellsIBG, 
+                                       active_linear_index_to_tuple
 
 import Oceananigans.TimeSteppers: compute_tendencies!
 
@@ -27,8 +27,9 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
     kernel_parameters = tuple(interior_tendency_kernel_parameters(model.grid))
-
-    compute_interior_tendency_contributions!(model, kernel_parameters; active_cells_map = active_interior_map(model.grid))
+    active_cells_map  = retrieve_interior_active_cells_map(model.grid, Val(:interior))
+    
+    compute_interior_tendency_contributions!(model, kernel_parameters; active_cells_map)
     complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
 
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
@@ -142,38 +143,38 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function compute_Gu!(Gu, grid, interior_map, args) 
+@kernel function compute_Gu!(Gu, grid, ::Nothing, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gu!(Gu, grid::ActiveCellsIBG, map::InteriorMap, args) 
+@kernel function compute_Gu!(Gu, grid, interior_map, args) 
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, map, grid)
+    i, j, k = active_linear_index_to_tuple(idx, interior_map)
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function compute_Gv!(Gv, grid, interior_map, args) 
+@kernel function compute_Gv!(Gv, grid, ::Nothing, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gv!(Gv, grid::ActiveCellsIBG, map::InteriorMap, args) 
+@kernel function compute_Gv!(Gv, grid, interior_map, args) 
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, map, grid)
+    i, j, k = active_linear_index_to_tuple(idx, interior_map)
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the w-velocity equation. """
-@kernel function compute_Gw!(Gw, grid, interior_map, args) 
+@kernel function compute_Gw!(Gw, grid, ::Nothing, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gw!(Gw, grid::ActiveCellsIBG, map, ::InteriorMap, args)
+@kernel function compute_Gw!(Gw, grid, interior_map, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, map, grid)
+    i, j, k = active_linear_index_to_tuple(idx, interior_map)
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -182,14 +183,14 @@ end
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function compute_Gc!(Gc, grid, interior_map, args)
+@kernel function compute_Gc!(Gc, grid, ::Nothing, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gc!(Gc, grid::ActiveCellsIBG, map::InteriorMap, args) 
+@kernel function compute_Gc!(Gc, grid, interior_map, args) 
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, map, grid)
+    i, j, k = active_linear_index_to_tuple(idx, interior_map)
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
 end
 
diff --git a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
index b5a22163c3..6769891656 100644
--- a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
+++ b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
@@ -15,7 +15,7 @@ using Oceananigans.Models: AbstractModel, NaNChecker, extract_boundary_condition
 using Oceananigans.Solvers: FFTBasedPoissonSolver
 using Oceananigans.TimeSteppers: Clock, TimeStepper, update_state!, AbstractLagrangianParticles
 using Oceananigans.TurbulenceClosures: validate_closure, with_tracers, DiffusivityFields, time_discretization, implicit_diffusion_solver
-using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: FlavorOfCATKE
+using Oceananigans.TurbulenceClosures.TKEBasedVerticalDiffusivities: FlavorOfCATKE
 using Oceananigans.Utils: tupleit
 using Oceananigans.Grids: topology
 
@@ -25,8 +25,12 @@ import Oceananigans.Models: total_velocities, default_nan_checker, timestepper
 const ParticlesOrNothing = Union{Nothing, AbstractLagrangianParticles}
 const AbstractBGCOrNothing = Union{Nothing, AbstractBiogeochemistry}
 
+# TODO: this concept may be more generally useful,
+# but for now we use it only for hydrostatic pressure anomalies.
+struct DefaultHydrostaticPressureAnomaly end
+
 mutable struct NonhydrostaticModel{TS, E, A<:AbstractArchitecture, G, T, B, R, SD, U, C, Φ, F,
-                                   V, S, K, BG, P, BGC, I, AF} <: AbstractModel{TS}
+                                   V, S, K, BG, P, BGC, AF} <: AbstractModel{TS}
 
          architecture :: A        # Computer `Architecture` on which `Model` is run
                  grid :: G        # Grid of physical points on which `Model` is solved
@@ -46,7 +50,6 @@ mutable struct NonhydrostaticModel{TS, E, A<:AbstractArchitecture, G, T, B, R, S
    diffusivity_fields :: K        # Container for turbulent diffusivities
           timestepper :: TS       # Object containing timestepper fields and parameters
       pressure_solver :: S        # Pressure/Poisson solver
-    immersed_boundary :: I        # Models the physics of immersed boundaries within the grid
      auxiliary_fields :: AF       # User-specified auxiliary fields for forcing functions and boundary conditions
 end
 
@@ -61,16 +64,15 @@ end
                                   closure = nothing,
           boundary_conditions::NamedTuple = NamedTuple(),
                                   tracers = (),
-                              timestepper = :QuasiAdamsBashforth2,
+                              timestepper = :RungeKutta3,
             background_fields::NamedTuple = NamedTuple(),
             particles::ParticlesOrNothing = nothing,
     biogeochemistry::AbstractBGCOrNothing = nothing,
                                velocities = nothing,
                   nonhydrostatic_pressure = CenterField(grid),
-             hydrostatic_pressure_anomaly = nothing,
+             hydrostatic_pressure_anomaly = DefaultHydrostaticPressureAnomaly(),
                        diffusivity_fields = nothing,
                           pressure_solver = nothing,
-                        immersed_boundary = nothing,
                          auxiliary_fields = NamedTuple())
 
 Construct a model for a non-hydrostatic, incompressible fluid on `grid`, using the Boussinesq
@@ -93,44 +95,42 @@ Keyword arguments
   - `tracers`: A tuple of symbols defining the names of the modeled tracers, or a `NamedTuple` of
                preallocated `CenterField`s.
   - `timestepper`: A symbol that specifies the time-stepping method. Either `:QuasiAdamsBashforth2` or
-                   `:RungeKutta3`.
+                   `:RungeKutta3` (default).
   - `background_fields`: `NamedTuple` with background fields (e.g., background flow). Default: `nothing`.
   - `particles`: Lagrangian particles to be advected with the flow. Default: `nothing`.
   - `biogeochemistry`: Biogeochemical model for `tracers`.
   - `velocities`: The model velocities. Default: `nothing`.
   - `nonhydrostatic_pressure`: The nonhydrostatic pressure field. Default: `CenterField(grid)`.
   - `hydrostatic_pressure_anomaly`: An optional field that stores the part of the nonhydrostatic pressure
-                                    in hydrostatic balance with the buoyancy field. If `nothing` (default), the anomaly
-                                    is not computed. If `CenterField(grid)`, the anomaly is precomputed by
+                                    in hydrostatic balance with the buoyancy field. If `CenterField(grid)` (the default when `buoyancy` is specified), the anomaly is precomputed by
                                     vertically integrating the buoyancy field. In this case, the `nonhydrostatic_pressure` represents
-                                    only the part of pressure that deviates from the hydrostatic anomaly.
+                                    only the part of pressure that deviates from the hydrostatic anomaly. If `nothing` (the default when
+                                    `buoyancy = nothing`), the anomaly is not computed.
   - `diffusivity_fields`: Diffusivity fields. Default: `nothing`.
   - `pressure_solver`: Pressure solver to be used in the model. If `nothing` (default), the model constructor
     chooses the default based on the `grid` provide.
-  - `immersed_boundary`: The immersed boundary. Default: `nothing`.
   - `auxiliary_fields`: `NamedTuple` of auxiliary fields. Default: `nothing`         
 """
 function NonhydrostaticModel(; grid,
-                                    clock = Clock{eltype(grid)}(time = 0),
-                                advection = CenteredSecondOrder(),
-                                 buoyancy = nothing,
-                                 coriolis = nothing,
+                             clock = Clock{eltype(grid)}(time = 0),
+                             advection = CenteredSecondOrder(),
+                             buoyancy = nothing,
+                             coriolis = nothing,
                              stokes_drift = nothing,
-                      forcing::NamedTuple = NamedTuple(),
-                                  closure = nothing,
-          boundary_conditions::NamedTuple = NamedTuple(),
-                                  tracers = (),
-                              timestepper = :QuasiAdamsBashforth2,
-            background_fields::NamedTuple = NamedTuple(),
-            particles::ParticlesOrNothing = nothing,
-    biogeochemistry::AbstractBGCOrNothing = nothing,
-                               velocities = nothing,
-             hydrostatic_pressure_anomaly = nothing,
-                  nonhydrostatic_pressure = CenterField(grid),
-                       diffusivity_fields = nothing,
-                          pressure_solver = nothing,
-                        immersed_boundary = nothing,
-                         auxiliary_fields = NamedTuple())
+                             forcing::NamedTuple = NamedTuple(),
+                             closure = nothing,
+                             boundary_conditions::NamedTuple = NamedTuple(),
+                             tracers = (),
+                             timestepper = :RungeKutta3,
+                             background_fields::NamedTuple = NamedTuple(),
+                             particles::ParticlesOrNothing = nothing,
+                             biogeochemistry::AbstractBGCOrNothing = nothing,
+                             velocities = nothing,
+                             hydrostatic_pressure_anomaly = DefaultHydrostaticPressureAnomaly(),
+                             nonhydrostatic_pressure = CenterField(grid),
+                             diffusivity_fields = nothing,
+                             pressure_solver = nothing,
+                             auxiliary_fields = NamedTuple())
 
     arch = architecture(grid)
 
@@ -139,6 +139,25 @@ function NonhydrostaticModel(; grid,
     # Validate pressure fields
     nonhydrostatic_pressure isa Field{Center, Center, Center} ||
         throw(ArgumentError("nonhydrostatic_pressure must be CenterField(grid)."))
+
+    if hydrostatic_pressure_anomaly isa DefaultHydrostaticPressureAnomaly
+        # Manage treatment of the hydrostatic pressure anomaly:
+
+        if !isnothing(buoyancy)
+            # Separate the hydrostatic pressure anomaly
+            # from the nonhydrostatic pressure contribution.
+            # See https://github.com/CliMA/Oceananigans.jl/issues/3677
+            # and https://github.com/CliMA/Oceananigans.jl/issues/3795.
+
+            hydrostatic_pressure_anomaly = CenterField(grid)
+        else
+            # Use a single combined pressure, saving memory and computation.
+
+            hydrostatic_pressure_anomaly = nothing
+        end
+    end
+
+    # Check validity of hydrostatic_pressure_anomaly.
     isnothing(hydrostatic_pressure_anomaly) || hydrostatic_pressure_anomaly isa Field{Center, Center, Center} ||
         throw(ArgumentError("hydrostatic_pressure_anomaly must be `nothing` or `CenterField(grid)`."))
 
@@ -189,7 +208,7 @@ function NonhydrostaticModel(; grid,
     diffusivity_fields = DiffusivityFields(diffusivity_fields, grid, tracernames(tracers), boundary_conditions, closure)
 
     if isnothing(pressure_solver)
-        pressure_solver = PressureSolver(arch, grid)
+        pressure_solver = nonhydrostatic_pressure_solver(grid)
     end
 
     # Materialize background fields
@@ -205,11 +224,10 @@ function NonhydrostaticModel(; grid,
 
     model = NonhydrostaticModel(arch, grid, clock, advection, buoyancy, coriolis, stokes_drift,
                                 forcing, closure, background_fields, particles, biogeochemistry, velocities, tracers,
-                                pressures, diffusivity_fields, timestepper, pressure_solver, immersed_boundary,
-                                auxiliary_fields)
-
-    update_state!(model)
+                                pressures, diffusivity_fields, timestepper, pressure_solver, auxiliary_fields)
 
+    update_state!(model; compute_tendencies = false)
+    
     return model
 end
 
diff --git a/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl b/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl
index 19003a9438..83dcda0807 100644
--- a/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl
+++ b/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl
@@ -1,3 +1,4 @@
+using Oceananigans.BoundaryConditions: fill_halo_regions!
 using Oceananigans.TimeSteppers: update_state!, calculate_pressure_correction!, pressure_correct_velocities!
 
 import Oceananigans.Fields: set!
@@ -39,18 +40,20 @@ function set!(model::NonhydrostaticModel; enforce_incompressibility=true, kwargs
             throw(ArgumentError("name $fldname not found in model.velocities or model.tracers."))
         end
         set!(ϕ, value)
+
+        fill_halo_regions!(ϕ, model.clock, fields(model))
     end
 
     # Apply a mask
     foreach(mask_immersed_field!, model.tracers)
     foreach(mask_immersed_field!, model.velocities)
-    update_state!(model)
+    update_state!(model; compute_tendencies = false)
 
     if enforce_incompressibility
         FT = eltype(model.grid)
         calculate_pressure_correction!(model, one(FT))
         pressure_correct_velocities!(model, one(FT))
-        update_state!(model)
+        update_state!(model; compute_tendencies = false)
     end
 
     return nothing
diff --git a/src/Models/NonhydrostaticModels/solve_for_pressure.jl b/src/Models/NonhydrostaticModels/solve_for_pressure.jl
index 23bd6b2599..c3db828018 100644
--- a/src/Models/NonhydrostaticModels/solve_for_pressure.jl
+++ b/src/Models/NonhydrostaticModels/solve_for_pressure.jl
@@ -1,93 +1,91 @@
 using Oceananigans.Operators
-using Oceananigans.Solvers: FFTBasedPoissonSolver, FourierTridiagonalPoissonSolver, solve!
 using Oceananigans.DistributedComputations: DistributedFFTBasedPoissonSolver
-using Oceananigans.Grids: XDirection, YDirection, ZDirection
-
-using PencilArrays: Permutation
+using Oceananigans.Grids: XDirection, YDirection, ZDirection, inactive_cell
+using Oceananigans.Solvers: FFTBasedPoissonSolver, FourierTridiagonalPoissonSolver
+using Oceananigans.Solvers: ConjugateGradientPoissonSolver
+using Oceananigans.Solvers: solve!
 
 #####
 ##### Calculate the right-hand-side of the non-hydrostatic pressure Poisson equation.
 #####
 
-const ZXYPermutation = Permutation{(3, 1, 2), 3}
-const ZYXPermutation = Permutation{(3, 2, 1), 3}
-
-@kernel function calculate_pressure_source_term_fft_based_solver!(rhs, grid, Δt, U★)
-    i, j, k = @index(Global, NTuple)
-    @inbounds rhs[i, j, k] = divᶜᶜᶜ(i, j, k, grid, U★.u, U★.v, U★.w) / Δt
-end
-
-@kernel function calculate_permuted_pressure_source_term_fft_based_solver!(rhs, grid, Δt, U★, ::ZXYPermutation)
+@kernel function _compute_source_term!(rhs, grid, Δt, Ũ)
     i, j, k = @index(Global, NTuple)
-    @inbounds rhs[k, i, j] = divᶜᶜᶜ(i, j, k, grid, U★.u, U★.v, U★.w) / Δt
+    active = !inactive_cell(i, j, k, grid)
+    δ = divᶜᶜᶜ(i, j, k, grid, Ũ.u, Ũ.v, Ũ.w)
+    @inbounds rhs[i, j, k] = active * δ / Δt
 end
 
-@kernel function calculate_permuted_pressure_source_term_fft_based_solver!(rhs, grid, Δt, U★, ::ZYXPermutation)
+@kernel function _fourier_tridiagonal_source_term!(rhs, ::XDirection, grid, Δt, Ũ)
     i, j, k = @index(Global, NTuple)
-    @inbounds rhs[k, j, i] = divᶜᶜᶜ(i, j, k, grid, U★.u, U★.v, U★.w) / Δt
+    active = !inactive_cell(i, j, k, grid)
+    δ = divᶜᶜᶜ(i, j, k, grid, Ũ.u, Ũ.v, Ũ.w)
+    @inbounds rhs[i, j, k] = active * Δxᶜᶜᶜ(i, j, k, grid) * δ / Δt
 end
 
-@kernel function calculate_pressure_source_term_fourier_tridiagonal_solver!(rhs, grid, Δt, U★, ::XDirection)
+@kernel function _fourier_tridiagonal_source_term!(rhs, ::YDirection, grid, Δt, Ũ)
     i, j, k = @index(Global, NTuple)
-    @inbounds rhs[i, j, k] = Δxᶜᶜᶜ(i, j, k, grid) * divᶜᶜᶜ(i, j, k, grid, U★.u, U★.v, U★.w) / Δt
+    active = !inactive_cell(i, j, k, grid)
+    δ = divᶜᶜᶜ(i, j, k, grid, Ũ.u, Ũ.v, Ũ.w)
+    @inbounds rhs[i, j, k] = active * Δyᶜᶜᶜ(i, j, k, grid) * δ / Δt
 end
 
-@kernel function calculate_pressure_source_term_fourier_tridiagonal_solver!(rhs, grid, Δt, U★, ::YDirection)
+@kernel function _fourier_tridiagonal_source_term!(rhs, ::ZDirection, grid, Δt, Ũ)
     i, j, k = @index(Global, NTuple)
-    @inbounds rhs[i, j, k] = Δyᶜᶜᶜ(i, j, k, grid) * divᶜᶜᶜ(i, j, k, grid, U★.u, U★.v, U★.w) / Δt
+    active = !inactive_cell(i, j, k, grid)
+    δ = divᶜᶜᶜ(i, j, k, grid, Ũ.u, Ũ.v, Ũ.w)
+    @inbounds rhs[i, j, k] = active * Δzᶜᶜᶜ(i, j, k, grid) * δ / Δt
 end
 
-@kernel function calculate_pressure_source_term_fourier_tridiagonal_solver!(rhs, grid, Δt, U★, ::ZDirection)
-    i, j, k = @index(Global, NTuple)
-    @inbounds rhs[i, j, k] = Δzᶜᶜᶜ(i, j, k, grid) * divᶜᶜᶜ(i, j, k, grid, U★.u, U★.v, U★.w) / Δt
+function compute_source_term!(pressure, solver::DistributedFFTBasedPoissonSolver, Δt, Ũ)
+    rhs  = solver.storage.zfield
+    arch = architecture(solver)
+    grid = solver.local_grid
+    launch!(arch, grid, :xyz, _compute_source_term!, rhs, grid, Δt, Ũ)
+    return nothing        
 end
 
-#####
-##### Solve for pressure
-#####
-
-function solve_for_pressure!(pressure, solver::DistributedFFTBasedPoissonSolver, Δt, U★)
-    rhs = parent(first(solver.storage))
+function compute_source_term!(pressure, solver::DistributedFourierTridiagonalPoissonSolver, Δt, Ũ)
+    rhs = solver.storage.zfield
     arch = architecture(solver)
     grid = solver.local_grid
-
-    launch!(arch, grid, :xyz, calculate_permuted_pressure_source_term_fft_based_solver!,
-            rhs, grid, Δt, U★, solver.input_permutation,)
-
-    # Solve pressure Poisson equation for pressure, given rhs
-    solve!(pressure, solver)
-
-    return pressure
+    tdir = solver.batched_tridiagonal_solver.tridiagonal_direction
+    launch!(arch, grid, :xyz, _fourier_tridiagonal_source_term!, rhs, tdir, grid, Δt, Ũ)
+    return nothing
 end
 
-function solve_for_pressure!(pressure, solver::FFTBasedPoissonSolver, Δt, U★)
-
-    # Calculate right hand side:
-    rhs = solver.storage
+function compute_source_term!(pressure, solver::FourierTridiagonalPoissonSolver, Δt, Ũ)
+    rhs = solver.source_term
     arch = architecture(solver)
     grid = solver.grid
-
-    launch!(arch, grid, :xyz, calculate_pressure_source_term_fft_based_solver!,
-            rhs, grid, Δt, U★)
-
-    # Solve pressure Poisson given for pressure, given rhs
-    solve!(pressure, solver, rhs)
-
+    tdir = solver.batched_tridiagonal_solver.tridiagonal_direction
+    launch!(arch, grid, :xyz, _fourier_tridiagonal_source_term!, rhs, tdir, grid, Δt, Ũ)
     return nothing
 end
 
-function solve_for_pressure!(pressure, solver::FourierTridiagonalPoissonSolver, Δt, U★)
-
-    # Calculate right hand side:
-    rhs = solver.source_term
+function compute_source_term!(pressure, solver::FFTBasedPoissonSolver, Δt, Ũ)
+    rhs = solver.storage
     arch = architecture(solver)
     grid = solver.grid
+    launch!(arch, grid, :xyz, _compute_source_term!, rhs, grid, Δt, Ũ)
+    return nothing
+end
 
-    launch!(arch, grid, :xyz, calculate_pressure_source_term_fourier_tridiagonal_solver!,
-            rhs, grid, Δt, U★, solver.batched_tridiagonal_solver.tridiagonal_direction)
+#####
+##### Solve for pressure
+#####
 
-    # Pressure Poisson rhs, scaled by the spacing in the stretched direction at ᶜᶜᶜ, is stored in solver.source_term:
+function solve_for_pressure!(pressure, solver, Δt, Ũ)
+    compute_source_term!(pressure, solver, Δt, Ũ)
     solve!(pressure, solver)
+    return pressure
+end
 
-    return nothing
+function solve_for_pressure!(pressure, solver::ConjugateGradientPoissonSolver, Δt, Ũ)
+    rhs = solver.right_hand_side
+    grid = solver.grid
+    arch = architecture(grid)
+    launch!(arch, grid, :xyz, _compute_source_term!, rhs, grid, Δt, Ũ)
+    return solve!(pressure, solver.conjugate_gradient_solver, rhs)
 end
+
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 0d02570020..a35e68e63b 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -42,11 +42,9 @@ update_hydrostatic_pressure!(::Nothing, arch, ::PCBIBG, args...; kw...) = nothin
     Nx, Ny, _ = size(grid)
     TX, TY, _ = topology(grid)
 
-    Sx = TX == Flat ? Nx : Nx + 2
-    Sy = TY == Flat ? Ny : Ny + 2
-
-    Ox = TX == Flat ? 0 : - 1 
-    Oy = TY == Flat ? 0 : - 1 
-
-    return KernelParameters((Sx, Sy), (Ox, Oy))
+    ii = ifelse(TX == Flat, 1:Nx, 0:Nx+1)
+    jj = ifelse(TY == Flat, 1:Ny, 0:Ny+1)
+        
+    return KernelParameters(ii, jj)
 end
+
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index b59a74eafe..671253be84 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -2,6 +2,7 @@ using Oceananigans: UpdateStateCallsite
 using Oceananigans.Architectures
 using Oceananigans.BoundaryConditions
 using Oceananigans.Biogeochemistry: update_biogeochemical_state!
+using Oceananigans.BoundaryConditions: update_boundary_condition!
 using Oceananigans.TurbulenceClosures: compute_diffusivities!
 using Oceananigans.Fields: compute!
 using Oceananigans.ImmersedBoundaries: mask_immersed_field!
@@ -26,8 +27,12 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
     # Update all FieldTimeSeries used in the model
     update_model_field_time_series!(model, model.clock)
 
+    # Update the boundary conditions
+    update_boundary_condition!(fields(model), model)
+
     # Fill halos for velocities and tracers
-    fill_halo_regions!(merge(model.velocities, model.tracers), model.clock, fields(model); async = true)
+    fill_halo_regions!(merge(model.velocities, model.tracers), model.clock, fields(model); 
+                       fill_boundary_normal_velocities = false, async = true)
 
     # Compute auxiliary fields
     for aux_field in model.auxiliary_fields
@@ -61,4 +66,4 @@ function compute_auxiliaries!(model::NonhydrostaticModel; p_parameters = tuple(p
         update_hydrostatic_pressure!(model; parameters = ppar)
     end
     return nothing
-end
+end
\ No newline at end of file
diff --git a/src/Models/ShallowWaterModels/set_shallow_water_model.jl b/src/Models/ShallowWaterModels/set_shallow_water_model.jl
index b68f279a04..b836106d12 100644
--- a/src/Models/ShallowWaterModels/set_shallow_water_model.jl
+++ b/src/Models/ShallowWaterModels/set_shallow_water_model.jl
@@ -14,7 +14,7 @@ function set!(model::ShallowWaterModel; kwargs...)
         set!(ϕ, value)
     end
 
-    update_state!(model)
+    update_state!(model; compute_tendencies = false)
     
     return nothing
 end
diff --git a/src/Models/ShallowWaterModels/shallow_water_model.jl b/src/Models/ShallowWaterModels/shallow_water_model.jl
index bd46367ba9..7e51210703 100644
--- a/src/Models/ShallowWaterModels/shallow_water_model.jl
+++ b/src/Models/ShallowWaterModels/shallow_water_model.jl
@@ -112,7 +112,7 @@ Keyword arguments
 function ShallowWaterModel(;
                            grid,
                            gravitational_acceleration,
-                               clock = Clock{eltype(grid), eltype(grid)}(0, Inf, 0, 1),
+                               clock = Clock{eltype(grid)}(time=0),
                   momentum_advection = UpwindBiasedFifthOrder(),
                     tracer_advection = WENO(),
                       mass_advection = WENO(),
@@ -205,7 +205,7 @@ function ShallowWaterModel(;
                               timestepper,
                               formulation)
 
-    update_state!(model)
+    update_state!(model; compute_tendencies = false)
 
     return model
 end
diff --git a/src/Models/interleave_communication_and_computation.jl b/src/Models/interleave_communication_and_computation.jl
index 413a4ef047..286097e826 100644
--- a/src/Models/interleave_communication_and_computation.jl
+++ b/src/Models/interleave_communication_and_computation.jl
@@ -38,13 +38,38 @@ function interior_tendency_kernel_parameters(grid, arch)
     Hx, Hy, _ = halo_size(grid)
     Tx, Ty, _ = topology(grid)
     Nx, Ny, Nz = size(grid)
-    
-    Sx = Rx == 1 ? Nx : (Tx == RightConnected || Tx == LeftConnected ? Nx - Hx : Nx - 2Hx)
-    Sy = Ry == 1 ? Ny : (Ty == RightConnected || Ty == LeftConnected ? Ny - Hy : Ny - 2Hy)
 
+    # Kernel parameters that cover the whole interior when a direction is local (`R == 1`), and only
+    # the part of the domain that does not depend on the halo cells when the direction is partitioned.
+    local_x = Rx == 1
+    local_y = Ry == 1
+    one_sided_x = Tx == RightConnected || Tx == LeftConnected
+    one_sided_y = Ty == RightConnected || Ty == LeftConnected 
+
+    # Sizes
+    Sx = if local_x
+        Nx
+    elseif one_sided_x
+        Nx - Hx
+    else # two sided
+        Nx - 2Hx
+    end
+
+    Sy = if local_y
+        Ny
+    elseif one_sided_y
+        Ny - Hy
+    else # two sided
+        Ny - 2Hy
+    end
+
+    # Offsets
     Ox = Rx == 1 || Tx == RightConnected ? 0 : Hx
     Oy = Ry == 1 || Ty == RightConnected ? 0 : Hy
+
+    sizes = (Sx, Sy, Nz)
+    offsets = (Ox, Oy, 0)
      
-    return KernelParameters((Sx, Sy, Nz), (Ox, Oy, 0))
+    return KernelParameters(sizes, offsets)
 end
 
diff --git a/src/Models/seawater_density.jl b/src/Models/seawater_density.jl
index d3073976d3..6dbfdefb69 100644
--- a/src/Models/seawater_density.jl
+++ b/src/Models/seawater_density.jl
@@ -63,7 +63,7 @@ julia> grid = RectilinearGrid(size=100, z=(-1000, 0), topology=(Flat, Flat, Boun
 1×1×100 RectilinearGrid{Float64, Flat, Flat, Bounded} on CPU with 0×0×3 halo
 ├── Flat x
 ├── Flat y
-└── Bounded  z ∈ [-1000.0, 0.0]   regularly spaced with Δz=10.0
+└── Bounded  z ∈ [-1000.0, 0.0] regularly spaced with Δz=10.0
 
 julia> tracers = (:T, :S)
 (:T, :S)
@@ -81,7 +81,7 @@ SeawaterBuoyancy{Float64}:
 julia> model = NonhydrostaticModel(; grid, buoyancy, tracers)
 NonhydrostaticModel{CPU, RectilinearGrid}(time = 0 seconds, iteration = 0)
 ├── grid: 1×1×100 RectilinearGrid{Float64, Flat, Flat, Bounded} on CPU with 0×0×3 halo
-├── timestepper: QuasiAdamsBashforth2TimeStepper
+├── timestepper: RungeKutta3TimeStepper
 ├── advection scheme: Centered reconstruction order 2
 ├── tracers: (T, S)
 ├── closure: Nothing
@@ -93,7 +93,7 @@ julia> set!(model, S = 34.7, T = 0.5)
 julia> density_operation = seawater_density(model)
 KernelFunctionOperation at (Center, Center, Center)
 ├── grid: 1×1×100 RectilinearGrid{Float64, Flat, Flat, Bounded} on CPU with 0×0×3 halo
-├── kernel_function: ρ (generic function with 2 methods)
+├── kernel_function: ρ (generic function with 3 methods)
 └── arguments: ("BoussinesqEquationOfState{Float64}", "1×1×100 Field{Center, Center, Center} on RectilinearGrid on CPU", "1×1×100 Field{Center, Center, Center} on RectilinearGrid on CPU", "KernelFunctionOperation at (Center, Center, Center)")
 
 julia> density_field = Field(density_operation)
@@ -114,7 +114,7 @@ julia> compute!(density_field)
 ├── operand: KernelFunctionOperation at (Center, Center, Center)
 ├── status: time=0.0
 └── data: 1×1×106 OffsetArray(::Array{Float64, 3}, 1:1, 1:1, -2:103) with eltype Float64 with indices 1:1×1:1×-2:103
-    └── max=1027.81, min=1027.71, mean=1027.76
+    └── max=1032.38, min=1027.73, mean=1030.06
 ```
 
 Values for `temperature`, `salinity` and `geopotential_height` can be passed to
diff --git a/src/MultiRegion/cubed_sphere_grid.jl b/src/MultiRegion/cubed_sphere_grid.jl
index 4bfb29d7fc..d84a19712f 100644
--- a/src/MultiRegion/cubed_sphere_grid.jl
+++ b/src/MultiRegion/cubed_sphere_grid.jl
@@ -23,6 +23,8 @@ const ConformalCubedSphereGrid{FT, TX, TY, TZ} = MultiRegionGrid{FT, TX, TY, TZ,
                              horizontal_topology = FullyConnected,
                              z_topology = Bounded,
                              radius = R_Earth,
+                             non_uniform_conformal_mapping = false,
+                             spacing_type = "geometric",
                              partition = CubedSpherePartition(; R = 1),
                              devices = nothing)
 
@@ -186,6 +188,8 @@ function ConformalCubedSphereGrid(arch::AbstractArchitecture=CPU(), FT=Float64;
                                   horizontal_topology = FullyConnected,
                                   z_topology = Bounded,
                                   radius = R_Earth,
+                                  non_uniform_conformal_mapping = false,
+                                  spacing_type = "geometric",
                                   partition = CubedSpherePartition(; R = 1),
                                   devices = nothing)
     Nx, Ny, _ = panel_size
@@ -233,7 +237,9 @@ function ConformalCubedSphereGrid(arch::AbstractArchitecture=CPU(), FT=Float64;
                                         radius,
                                         ξ = region_ξ,
                                         η = region_η,
-                                        rotation = region_rotation)
+                                        rotation = region_rotation,
+                                        non_uniform_conformal_mapping = non_uniform_conformal_mapping,
+                                        spacing_type = spacing_type)
 
     grid = MultiRegionGrid{FT, region_topology...}(CPU(),
                                                    partition,
@@ -263,59 +269,65 @@ end
 function fill_halo_regions!(grid::ConformalCubedSphereGrid{FT, TX, TY, TZ}) where {FT, TX, TY, TZ}
     Nx, Ny, Nz = size(grid)
 
-    fields = (:λᶜᶜᵃ,   :φᶜᶜᵃ,   :Azᶜᶜᵃ , :λᶠᶠᵃ, :φᶠᶠᵃ, :Azᶠᶠᵃ)
-    LXs    = (:Center, :Center, :Center, :Face, :Face, :Face )
-    LYs    = (:Center, :Center, :Center, :Face, :Face, :Face )
-
-    for (field, LX, LY) in zip(fields, LXs, LYs)
-        expr = quote
-            $(Symbol(field)) = Field{$(Symbol(LX)), $(Symbol(LY)), Nothing}($(grid))
-
-            for region in 1:number_of_regions($(grid))
-                getregion($(Symbol(field)), region).data .= getregion($(grid), region).$(Symbol(field))
-            end
-
-            if $(TX) == FullyConnected
-                fill_halo_regions!($(Symbol(field)))
-            end
+    λᶜᶜᵃ  = Field((Center, Center, Nothing), grid)
+    φᶜᶜᵃ  = Field((Center, Center, Nothing), grid)
+    Azᶜᶜᵃ = Field((Center, Center, Nothing), grid)
+    λᶠᶠᵃ  = Field((Face,   Face,   Nothing), grid)
+    φᶠᶠᵃ  = Field((Face,   Face,   Nothing), grid)
+    Azᶠᶠᵃ = Field((Face,   Face,   Nothing), grid)
+
+    for (field, name) in zip(( λᶜᶜᵃ, φᶜᶜᵃ,   Azᶜᶜᵃ,  λᶠᶠᵃ,  φᶠᶠᵃ,  Azᶠᶠᵃ),
+                             (:λᶜᶜᵃ, :φᶜᶜᵃ, :Azᶜᶜᵃ, :λᶠᶠᵃ, :φᶠᶠᵃ, :Azᶠᶠᵃ))
+        
+        for region in 1:number_of_regions(grid)
+            getregion(field, region).data .= getproperty(getregion(grid, region), name)
+        end
 
-            for region in 1:number_of_regions($(grid))
-                getregion($(grid), region).$(Symbol(field)) .= getregion($(Symbol(field)), region).data
-            end
-        end # quote
+        if TX == FullyConnected
+            fill_halo_regions!(field)
+        end
 
-        eval(expr)
+        for region in 1:number_of_regions(grid)
+            getproperty(getregion(grid, region), name) .= getregion(field, region).data
+        end
     end
 
-    fields₁ = (:Δxᶜᶜᵃ,  :Δxᶠᶜᵃ,  :Δyᶠᶜᵃ,  :λᶠᶜᵃ,   :φᶠᶜᵃ,   :Azᶠᶜᵃ , :Δxᶠᶠᵃ)
-    LXs₁    = (:Center, :Face,   :Face,   :Face,   :Face,   :Face  , :Face )
-    LYs₁    = (:Center, :Center, :Center, :Center, :Center, :Center, :Face )
-
-    fields₂ = (:Δyᶜᶜᵃ,  :Δyᶜᶠᵃ,  :Δxᶜᶠᵃ,  :λᶜᶠᵃ,   :φᶜᶠᵃ,   :Azᶜᶠᵃ , :Δyᶠᶠᵃ)
-    LXs₂    = (:Center, :Center, :Center, :Center, :Center, :Center, :Face )
-    LYs₂    = (:Center, :Face,   :Face,   :Face,   :Face,   :Face  , :Face )
-
-    for (field₁, LX₁, LY₁, field₂, LX₂, LY₂) in zip(fields₁, LXs₁, LYs₁, fields₂, LXs₂, LYs₂)
-        expr = quote
-            $(Symbol(field₁)) = Field{$(Symbol(LX₁)), $(Symbol(LY₁)), Nothing}($(grid))
-            $(Symbol(field₂)) = Field{$(Symbol(LX₂)), $(Symbol(LY₂)), Nothing}($(grid))
-
-            for region in 1:number_of_regions($(grid))
-                getregion($(Symbol(field₁)), region).data .= getregion($(grid), region).$(Symbol(field₁))
-                getregion($(Symbol(field₂)), region).data .= getregion($(grid), region).$(Symbol(field₂))
-            end
-
-            if $(TX) == FullyConnected
-                fill_halo_regions!(($(Symbol(field₁)), $(Symbol(field₂))); signed = false)
-            end
+    Δxᶜᶜᵃ = Field((Center, Center, Nothing), grid)
+    Δxᶠᶜᵃ = Field((Face,   Center, Nothing), grid)
+    Δyᶠᶜᵃ = Field((Face,   Center, Nothing), grid)
+    λᶠᶜᵃ  = Field((Face,   Center, Nothing), grid)
+    φᶠᶜᵃ  = Field((Face,   Center, Nothing), grid)
+    Azᶠᶜᵃ = Field((Face,   Center, Nothing), grid)
+    Δxᶠᶠᵃ = Field((Face,   Face,   Nothing), grid)
+    
+    fields₁ = ( Δxᶜᶜᵃ,   Δxᶠᶜᵃ,   Δyᶠᶜᵃ,   λᶠᶜᵃ,    φᶠᶜᵃ,    Azᶠᶜᵃ ,  Δxᶠᶠᵃ)
+    names₁  = (:Δxᶜᶜᵃ,  :Δxᶠᶜᵃ,  :Δyᶠᶜᵃ,  :λᶠᶜᵃ,   :φᶠᶜᵃ,   :Azᶠᶜᵃ , :Δxᶠᶠᵃ)
+
+    Δyᶜᶜᵃ = Field((Center, Center, Nothing), grid)
+    Δyᶜᶠᵃ = Field((Center, Face,   Nothing), grid)
+    Δxᶜᶠᵃ = Field((Center, Face,   Nothing), grid)
+    λᶜᶠᵃ  = Field((Center, Face,   Nothing), grid)
+    φᶜᶠᵃ  = Field((Center, Face,   Nothing), grid)
+    Azᶜᶠᵃ = Field((Center, Face,   Nothing), grid)
+    Δyᶠᶠᵃ = Field((Face,   Face,   Nothing), grid)
+
+    fields₂ = ( Δyᶜᶜᵃ,   Δyᶜᶠᵃ,   Δxᶜᶠᵃ,   λᶜᶠᵃ,    φᶜᶠᵃ,    Azᶜᶠᵃ ,  Δyᶠᶠᵃ)
+    names₂  = (:Δyᶜᶜᵃ,  :Δyᶜᶠᵃ,  :Δxᶜᶠᵃ,  :λᶜᶠᵃ,   :φᶜᶠᵃ,   :Azᶜᶠᵃ , :Δyᶠᶠᵃ)
+
+    for (field₁, field₂, name₁, name₂) in zip(fields₁, fields₂, names₁, names₂)
+        for region in 1:number_of_regions(grid)
+            getregion(field₁, region).data .= getproperty(getregion(grid, region), name₁)
+            getregion(field₂, region).data .= getproperty(getregion(grid, region), name₂)
+        end
 
-            for region in 1:number_of_regions($(grid))
-                getregion($(grid), region).$(Symbol(field₁)) .= getregion($(Symbol(field₁)), region).data
-                getregion($(grid), region).$(Symbol(field₂)) .= getregion($(Symbol(field₂)), region).data
-            end
-        end # quote
+        if TX == FullyConnected
+            fill_halo_regions!(field₁, field₂; signed = false)
+        end
 
-        eval(expr)
+        for region in 1:number_of_regions(grid)
+            getproperty(getregion(grid, region), name₁) .= getregion(field₁, region).data
+            getproperty(getregion(grid, region), name₂) .= getregion(field₂, region).data
+        end
     end
 
     ###################################################
diff --git a/src/MultiRegion/multi_region_abstract_operations.jl b/src/MultiRegion/multi_region_abstract_operations.jl
index 03d2fd9d8f..0cb14d90e6 100644
--- a/src/MultiRegion/multi_region_abstract_operations.jl
+++ b/src/MultiRegion/multi_region_abstract_operations.jl
@@ -42,4 +42,4 @@ end
 @inline _getregion(κ::KernelFunctionOperation{LX, LY, LZ}, r) where {LX, LY, LZ} = 
                 KernelFunctionOperation{LX, LY, LZ}(getregion(κ.kernel_function, r),
                                                     getregion(κ.grid, r), 
-                                                    getregion(κ.arguments, r)...)
\ No newline at end of file
+                                                    getregion(κ.arguments, r)...)
diff --git a/src/MultiRegion/multi_region_boundary_conditions.jl b/src/MultiRegion/multi_region_boundary_conditions.jl
index 764dc56b15..dc7b468eaa 100644
--- a/src/MultiRegion/multi_region_boundary_conditions.jl
+++ b/src/MultiRegion/multi_region_boundary_conditions.jl
@@ -12,7 +12,8 @@ using Oceananigans.BoundaryConditions:
             extract_north_bc, extract_top_bc, extract_bottom_bc,
             fill_halo_event!,
             MCBCT,
-            MCBC
+            MCBC,
+            fill_open_boundary_regions!
 
 import Oceananigans.Fields: tupled_fill_halo_regions!, boundary_conditions, data, fill_send_buffers!
 
@@ -98,7 +99,7 @@ function multi_region_permute_boundary_conditions(bcs)
     return (fill_halos!, boundary_conditions)
 end
 
-function fill_halo_regions!(c::MultiRegionObject, bcs, indices, loc, mrg::MultiRegionGrids, buffers, args...; kwargs...) 
+function fill_halo_regions!(c::MultiRegionObject, bcs, indices, loc, mrg::MultiRegionGrid, buffers, args...; fill_boundary_normal_velocities = true, kwargs...) 
     arch = architecture(mrg)
     @apply_regionally fill_halos!, bcs = multi_region_permute_boundary_conditions(bcs)
     
@@ -113,7 +114,13 @@ function fill_halo_regions!(c::MultiRegionObject, bcs, indices, loc, mrg::MultiR
             fill_halo = Val(fill_halo_side!)
             buff = get_buffers(fill_halo, buffers)
         end
-        buff = Reference(buff.regional_objects)
+        
+        buff = Reference(buffers.regional_objects)
+ 
+        if fill_boundary_normal_velocities
+            apply_regionally!(fill_open_boundary_regions!, c, bcs_side, indices, loc, mrg, args...) 
+        end
+
         apply_regionally!(fill_halo_event!, c, fill_halo_side!, bcs_side, 
                           indices, loc, arch, mrg, buff, 
                           args...; kwargs...)
diff --git a/src/MultiRegion/multi_region_field.jl b/src/MultiRegion/multi_region_field.jl
index 27b907708e..604f2fb4ab 100644
--- a/src/MultiRegion/multi_region_field.jl
+++ b/src/MultiRegion/multi_region_field.jl
@@ -6,6 +6,7 @@ using Oceananigans.OutputWriters: output_indices
 
 using Base: @propagate_inbounds
 
+import Oceananigans.DistributedComputations: reconstruct_global_field
 import Oceananigans.BoundaryConditions: FieldBoundaryConditions, regularize_field_boundary_conditions
 import Oceananigans.Grids: xnodes, ynodes
 import Oceananigans.Fields: set!, compute!, compute_at!, validate_field_data, validate_boundary_conditions
@@ -81,9 +82,6 @@ function reconstruct_global_field(mrf::MultiRegionField)
     return global_field
 end
 
-# Fallback!
-@inline reconstruct_global_field(f::AbstractField) = f
-
 function reconstruct_global_indices(indices, p::XPartition, N)
     idx1 = getregion(indices, 1)[1]
     idxl = getregion(indices, length(p))[1]
diff --git a/src/MultiRegion/multi_region_grid.jl b/src/MultiRegion/multi_region_grid.jl
index 20f8cdf787..0737306410 100644
--- a/src/MultiRegion/multi_region_grid.jl
+++ b/src/MultiRegion/multi_region_grid.jl
@@ -87,8 +87,8 @@ julia> using Oceananigans
 
 julia> grid = RectilinearGrid(size=(12, 12), extent=(1, 1), topology=(Bounded, Bounded, Flat))
 12×12×1 RectilinearGrid{Float64, Bounded, Bounded, Flat} on CPU with 3×3×0 halo
-├── Bounded  x ∈ [0.0, 1.0]       regularly spaced with Δx=0.0833333
-├── Bounded  y ∈ [0.0, 1.0]       regularly spaced with Δy=0.0833333
+├── Bounded  x ∈ [0.0, 1.0] regularly spaced with Δx=0.0833333
+├── Bounded  y ∈ [0.0, 1.0] regularly spaced with Δy=0.0833333
 └── Flat z
 
 julia> multi_region_grid = MultiRegionGrid(grid, partition = XPartition(4))
@@ -169,7 +169,7 @@ end
 
 Reconstruct the `mrg` global grid associated with the `MultiRegionGrid` on `architecture(mrg)`.
 """
-function reconstruct_global_grid(mrg)
+function reconstruct_global_grid(mrg::MultiRegionGrid)
     size   = reconstruct_size(mrg, mrg.partition)
     extent = reconstruct_extent(mrg, mrg.partition)
     topo   = topology(mrg)
@@ -203,8 +203,8 @@ Adapt an array `a` to be compatible with a `MultiRegionGrid`.
 function multi_region_object_from_array(a::AbstractArray, mrg::MultiRegionGrid)
     local_size = construct_regionally(size, mrg)
     arch = architecture(mrg)
-    a    = on_architecture(CPU(), a)
-    ma   = construct_regionally(partition_global_array, a, mrg.partition, local_size, Iterate(1:length(mrg)), arch)
+    a  = on_architecture(CPU(), a)
+    ma = construct_regionally(partition, a, mrg.partition, local_size, Iterate(1:length(mrg)), arch)
     return ma
 end
 
diff --git a/src/MultiRegion/multi_region_models.jl b/src/MultiRegion/multi_region_models.jl
index 5159a031ac..9d835342f7 100644
--- a/src/MultiRegion/multi_region_models.jl
+++ b/src/MultiRegion/multi_region_models.jl
@@ -5,9 +5,9 @@ using Oceananigans.TimeSteppers: AbstractTimeStepper, QuasiAdamsBashforth2TimeSt
 using Oceananigans.Models: PrescribedVelocityFields
 using Oceananigans.TurbulenceClosures: VerticallyImplicitTimeDiscretization
 using Oceananigans.Advection: AbstractAdvectionScheme
-using Oceananigans.Advection: VelocityUpwinding, OnlySelfUpwinding, CrossAndSelfUpwinding
+using Oceananigans.Advection: OnlySelfUpwinding, CrossAndSelfUpwinding
 using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom, GridFittedBoundary
-using Oceananigans.Solvers: PreconditionedConjugateGradientSolver
+using Oceananigans.Solvers: ConjugateGradientSolver
 
 import Oceananigans.Advection: WENO, cell_advection_timescale
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: build_implicit_step_solver, validate_tracer_advection
@@ -17,7 +17,7 @@ const MultiRegionModel = HydrostaticFreeSurfaceModel{<:Any, <:Any, <:AbstractArc
 
 # Utility to generate the inputs to complex `getregion`s
 function getregionalproperties(T, inner=true)
-    type = eval(T)
+    type = getglobal(@__MODULE__, T)
     names = fieldnames(type)
     args  = Vector(undef, length(names))
     for (n, name) in enumerate(names)
@@ -34,10 +34,9 @@ Types = (:HydrostaticFreeSurfaceModel,
          :SplitExplicitState,
          :SplitExplicitFreeSurface,
          :PrescribedVelocityFields,
-         :PreconditionedConjugateGradientSolver,
+         :ConjugateGradientSolver,
          :CrossAndSelfUpwinding,
          :OnlySelfUpwinding,
-         :VelocityUpwinding,
          :GridFittedBoundary,
          :GridFittedBottom,
          :PartialCellBottom)
diff --git a/src/MultiRegion/x_partitions.jl b/src/MultiRegion/x_partitions.jl
index 92fb3cf930..328de0d799 100644
--- a/src/MultiRegion/x_partitions.jl
+++ b/src/MultiRegion/x_partitions.jl
@@ -54,15 +54,15 @@ divide_direction(x::Tuple, p::XPartition) =
 divide_direction(x::AbstractArray, p::XPartition) =
     Tuple(x[1+sum(p.div[1:i-1]):1+sum(p.div[1:i])] for i in 1:length(p))
 
-partition_global_array(a::Function, args...) = a
-partition_global_array(a::Field, p::EqualXPartition, args...) = partition_global_array(a.data, p, args...)
+partition(a::Function, args...) = a
+partition(a::Field, p::EqualXPartition, args...) = partition(a.data, p, args...)
 
-function partition_global_array(a::AbstractArray, ::EqualXPartition, local_size, region, arch)
+function partition(a::AbstractArray, ::EqualXPartition, local_size, region, arch)
     idxs = default_indices(length(size(a)))
     return on_architecture(arch, a[local_size[1]*(region-1)+1:local_size[1]*region, idxs[2:end]...])
 end
 
-function partition_global_array(a::OffsetArray, ::EqualXPartition, local_size, region, arch) 
+function partition(a::OffsetArray, ::EqualXPartition, local_size, region, arch) 
     idxs    = default_indices(length(size(a)))
     offsets = (a.offsets[1], Tuple(0 for i in 1:length(idxs)-1)...)
     return on_architecture(arch, OffsetArray(a[local_size[1]*(region-1)+1+offsets[1]:local_size[1]*region-offsets[1], idxs[2:end]...], offsets...))
diff --git a/src/MultiRegion/y_partitions.jl b/src/MultiRegion/y_partitions.jl
index 06374a5a58..4780891568 100644
--- a/src/MultiRegion/y_partitions.jl
+++ b/src/MultiRegion/y_partitions.jl
@@ -50,15 +50,15 @@ function divide_direction(x::AbstractArray, p::EqualYPartition)
     return Tuple(x[1+(i-1)*nelem:1+i*nelem] for i in 1:length(p))
 end
 
-partition_global_array(a::Field, p::EqualYPartition, args...) = partition_global_array(a.data, p, args...)
+partition(a::Field, p::EqualYPartition, args...) = partition(a.data, p, args...)
 
-function partition_global_array(a::AbstractArray, ::EqualYPartition, local_size, region, arch) 
+function partition(a::AbstractArray, ::EqualYPartition, local_size, region, arch) 
     idxs = default_indices(length(size(a)))
     offsets = (a.offsets[1], Tuple(0 for i in 1:length(idxs)-1)...)
     return on_architecture(arch, OffsetArray(a[local_size[1]*(region-1)+1+offsets[1]:local_size[1]*region-offsets[1], idxs[2:end]...], offsets...))
 end
 
-function partition_global_array(a::OffsetArray, ::EqualYPartition, local_size, region, arch) 
+function partition(a::OffsetArray, ::EqualYPartition, local_size, region, arch) 
     idxs    = default_indices(length(size(a)))
     offsets = (0, a.offsets[2], Tuple(0 for i in 1:length(idxs)-2)...)
     return on_architecture(arch, OffsetArray(a[idxs[1], local_size[2]*(region-1)+1+offsets[2]:local_size[2]*region-offsets[2], idxs[3:end]...], offsets...))
diff --git a/src/Oceananigans.jl b/src/Oceananigans.jl
index b61fd69c3b..2abd97e7a0 100644
--- a/src/Oceananigans.jl
+++ b/src/Oceananigans.jl
@@ -19,6 +19,7 @@ export
     LatitudeLongitudeGrid,
     OrthogonalSphericalShellGrid,
     xnodes, ynodes, znodes, nodes,
+    λnodes, φnodes,
     xspacings, yspacings, zspacings,
     minimum_xspacing, minimum_yspacing, minimum_zspacing,
 
@@ -33,6 +34,7 @@ export
     UpwindBiased, UpwindBiasedFirstOrder, UpwindBiasedThirdOrder, UpwindBiasedFifthOrder, 
     WENO, WENOThirdOrder, WENOFifthOrder,
     VectorInvariant, WENOVectorInvariant, EnergyConserving, EnstrophyConserving,
+    TracerAdvection,
 
     # Boundary conditions
     BoundaryCondition,
@@ -142,7 +144,6 @@ import Base:
     iterate, similar, show,
     getindex, lastindex, setindex!,
     push!
-
     
 #####
 ##### Abstract types
@@ -188,6 +189,7 @@ function tupleit end
 function fields end
 function prognostic_fields end
 function tracer_tendency_kernel_function end
+function boundary_conditions end
 
 #####
 ##### Include all the submodules
@@ -211,7 +213,6 @@ include("DistributedComputations/DistributedComputations.jl")
 
 # TODO: move here
 #include("ImmersedBoundaries/ImmersedBoundaries.jl")
-#include("Distributed/Distributed.jl")
 #include("MultiRegion/MultiRegion.jl")
 
 # Physics, time-stepping, and models
@@ -224,8 +225,6 @@ include("Biogeochemistry.jl")
 
 # TODO: move above
 include("ImmersedBoundaries/ImmersedBoundaries.jl")
-# include("DistributedComputations/DistributedComputations.jl")
-
 include("Models/Models.jl")
 
 # Output and Physics, time-stepping, and models
diff --git a/src/Operators/Operators.jl b/src/Operators/Operators.jl
index 8f3721060f..570d43dd6b 100644
--- a/src/Operators/Operators.jl
+++ b/src/Operators/Operators.jl
@@ -60,6 +60,9 @@ export ℑxᶜᵃᵃ, ℑxᶠᵃᵃ, ℑyᵃᶜᵃ, ℑyᵃᶠᵃ, ℑzᵃᵃᶜ
 export ℑxyᶜᶜᵃ, ℑxyᶠᶜᵃ, ℑxyᶠᶠᵃ, ℑxyᶜᶠᵃ, ℑxzᶜᵃᶜ, ℑxzᶠᵃᶜ, ℑxzᶠᵃᶠ, ℑxzᶜᵃᶠ, ℑyzᵃᶜᶜ, ℑyzᵃᶠᶜ, ℑyzᵃᶠᶠ, ℑyzᵃᶜᶠ
 export ℑxyzᶜᶜᶠ, ℑxyzᶜᶠᶜ, ℑxyzᶠᶜᶜ, ℑxyzᶜᶠᶠ, ℑxyzᶠᶜᶠ, ℑxyzᶠᶠᶜ, ℑxyzᶜᶜᶜ, ℑxyzᶠᶠᶠ
 
+# Reference frame conversion
+export intrinsic_vector, extrinsic_vector
+
 using Oceananigans.Grids
 
 import Oceananigans.Grids: xspacing, yspacing, zspacing
@@ -86,4 +89,6 @@ include("divergence_operators.jl")
 include("vorticity_operators.jl")
 include("laplacian_operators.jl")
 
+include("vector_rotation_operators.jl")
+
 end # module
diff --git a/src/Operators/interpolation_utils.jl b/src/Operators/interpolation_utils.jl
index 2148d2570e..748eacf242 100644
--- a/src/Operators/interpolation_utils.jl
+++ b/src/Operators/interpolation_utils.jl
@@ -62,9 +62,9 @@ function interpolation_operator(from, to)
         global identity_counter += 1
         identity = identify_an_identity(identity_counter)
 
-        return @eval $identity
+        return getglobal(@__MODULE__, identity)
     else
-        return eval(Symbol(:ℑ, ℑxsym(x), ℑysym(y), ℑzsym(z), x, y, z))
+        return getglobal(@__MODULE__, Symbol(:ℑ, ℑxsym(x), ℑysym(y), ℑzsym(z), x, y, z))
     end
 end
 
@@ -77,7 +77,7 @@ operator for fields that have no intrinsic location, like numbers or functions.
 function interpolation_operator(::Nothing, to)
     global identity_counter += 1
     identity = identify_an_identity(identity_counter)
-    return @eval $identity
+    return getglobal(@__MODULE__, identity)
 end
 
 assumed_field_location(name) = name === :u  ? (Face, Center, Center) :
@@ -118,11 +118,11 @@ for LX in (:Center, :Face), LY in (:Center, :Face), LZ in (:Center, :Face)
         to   = (eval(IX), eval(IY), eval(IZ))
         interp_func = Symbol(interpolation_operator(from, to))
         @eval begin
-            ℑxyz(i, j, k, grid, from::F, to::T, c) where {F<:Tuple{<:$LX, <:$LY, <:$LZ}, T<:Tuple{<:$IX, <:$IY, <:$IZ}} = 
-                $interp_func(i, j, k, grid, c)
+            @inline ℑxyz(i, j, k, grid, from::F, to::T, c) where {F<:Tuple{<:$LX, <:$LY, <:$LZ}, T<:Tuple{<:$IX, <:$IY, <:$IZ}} = 
+                         $interp_func(i, j, k, grid, c)
          
-            ℑxyz(i, j, k, grid, from::F, to::T, f, args...) where {F<:Tuple{<:$LX, <:$LY, <:$LZ}, T<:Tuple{<:$IX, <:$IY, <:$IZ}} = 
-                $interp_func(i, j, k, grid, f, args...)
+            @inline ℑxyz(i, j, k, grid, from::F, to::T, f, args...) where {F<:Tuple{<:$LX, <:$LY, <:$LZ}, T<:Tuple{<:$IX, <:$IY, <:$IZ}} = 
+                         $interp_func(i, j, k, grid, f, args...)
         end
     end
 end
diff --git a/src/Operators/vector_rotation_operators.jl b/src/Operators/vector_rotation_operators.jl
new file mode 100644
index 0000000000..7ac9da502c
--- /dev/null
+++ b/src/Operators/vector_rotation_operators.jl
@@ -0,0 +1,135 @@
+# `getvalue(a, i, j, k, grid, args...)`: pointwise value of `a` at `(i, j, k)`. `nothing` and numbers are
+# returned unchanged; arrays are indexed with `@inbounds`, so callers must pass in-bounds indices.
+# TODO: generalize `getbc` into one Oceananigans-wide function covering functions, arrays, numbers, fields, etc.
+@inline getvalue(::Nothing,        i, j, k, grid, args...) = nothing
+@inline getvalue(a::Number,        i, j, k, grid, args...) = a
+@inline getvalue(a::AbstractArray, i, j, k, grid, args...) = @inbounds a[i, j, k]
+
+"""
+    intrinsic_vector(i, j, k, grid::AbstractGrid, uₑ, vₑ, wₑ)
+
+Convert the three-dimensional vector with components `uₑ, vₑ, wₑ`, defined in the _extrinsic_
+coordinate system associated with the domain, to the coordinate system _intrinsic_ to the grid.
+
+The _extrinsic_ coordinate system is:
+
+- Cartesian for any grid that discretizes a Cartesian domain (e.g. a `RectilinearGrid`)
+- Geographic for any grid that discretizes a spherical domain (e.g. an `AbstractCurvilinearGrid`)
+
+For the [`RectilinearGrid`](@ref) and the [`LatitudeLongitudeGrid`](@ref) the _extrinsic_ and _intrinsic_
+coordinate systems are equivalent, so this generic method applies no rotation and only evaluates each
+component at `(i, j, k)`. For other grids (e.g. the [`ConformalCubedSphereGrid`](@ref)) they may differ.
+"""
+@inline intrinsic_vector(i, j, k, grid::AbstractGrid, uₑ, vₑ, wₑ) = 
+    getvalue(uₑ, i, j, k, grid), getvalue(vₑ, i, j, k, grid), getvalue(wₑ, i, j, k, grid)
+
+"""
+    extrinsic_vector(i, j, k, grid::AbstractGrid, uᵢ, vᵢ, wᵢ)
+
+Convert the three-dimensional vector with components `uᵢ, vᵢ, wᵢ`, defined in the _intrinsic_ coordinate
+system of the grid, to the _extrinsic_ coordinate system associated with the domain.
+
+The _extrinsic_ coordinate system is:
+
+- Cartesian for any grid that discretizes a Cartesian domain (e.g. a `RectilinearGrid`)
+- Geographic for any grid that discretizes a spherical domain (e.g. an `AbstractCurvilinearGrid`)
+
+For the [`RectilinearGrid`](@ref) and the [`LatitudeLongitudeGrid`](@ref) the _extrinsic_ and _intrinsic_
+coordinate systems are equivalent, so this generic method applies no rotation and only evaluates each
+component at `(i, j, k)`. For other grids (e.g. the [`ConformalCubedSphereGrid`](@ref)) they may differ.
+"""
+@inline extrinsic_vector(i, j, k, grid::AbstractGrid, uᵢ, vᵢ, wᵢ) =
+    getvalue(uᵢ, i, j, k, grid), getvalue(vᵢ, i, j, k, grid), getvalue(wᵢ, i, j, k, grid)
+
+# 2D vectors: generic methods that apply no rotation and only evaluate each component at (i, j, k)
+@inline intrinsic_vector(i, j, k, grid::AbstractGrid, uₑ, vₑ) = 
+    getvalue(uₑ, i, j, k, grid), getvalue(vₑ, i, j, k, grid)
+
+@inline extrinsic_vector(i, j, k, grid::AbstractGrid, uᵢ, vᵢ) = 
+    getvalue(uᵢ, i, j, k, grid), getvalue(vᵢ, i, j, k, grid)
+
+# Intrinsic and extrinsic conversion for `OrthogonalSphericalShellGrid`s,
+# i.e. curvilinear grids defined on a sphere which are locally orthogonal.
+# If the coordinates match with the coordinates of a latitude-longitude grid
+# (i.e. globally orthogonal), these functions collapse to 
+# uₑ, vₑ, wₑ = uᵢ, vᵢ, wᵢ
+
+# 2D vectors
+@inline function intrinsic_vector(i, j, k, grid::OrthogonalSphericalShellGrid, uₑ, vₑ)
+
+    # Extrinsic components evaluated at (i, j, k)
+    u = getvalue(uₑ, i, j, k, grid)
+    v = getvalue(vₑ, i, j, k, grid)
+
+    # Latitude change (in radians) between the two y-faces of the cell, divided by the cell's
+    # y-spacing. θ is the rotation angle between the intrinsic and the extrinsic reference frame.
+    φ_jplus = φnode(i, j+1, 1, grid, Center(), Face(), Center())
+    φ_j     = φnode(i, j,   1, grid, Center(), Face(), Center())
+    Rcosθ   = deg2rad(φ_jplus - φ_j) / Δyᶜᶜᶜ(i, j, 1, grid)
+
+    # Latitude change (in radians) between the two x-faces of the cell, divided by the cell's
+    # x-spacing, with the sign flipped.
+    φ_iplus = φnode(i+1, j, 1, grid, Face(), Center(), Center())
+    φ_i     = φnode(i,   j, 1, grid, Face(), Center(), Center())
+    Rsinθ   = - deg2rad(φ_iplus - φ_i) / Δxᶜᶜᶜ(i, j, 1, grid)
+
+    # Normalization for the rotation angles
+    R = sqrt(Rcosθ^2 + Rsinθ^2)
+    cosθ = Rcosθ / R
+    sinθ = Rsinθ / R
+
+    # Rotate the extrinsic components into the intrinsic frame
+    uᵢ =   u * cosθ + v * sinθ
+    vᵢ = - u * sinθ + v * cosθ
+
+    return uᵢ, vᵢ
+end
+
+# 3D vectors: only the horizontal components are rotated
+@inline function intrinsic_vector(i, j, k, grid::OrthogonalSphericalShellGrid, uₑ, vₑ, wₑ)
+    # Horizontal pair, converted by the two-component method
+    horizontal = intrinsic_vector(i, j, k, grid, uₑ, vₑ)
+    # Vertical component, evaluated at (i, j, k) without rotation
+    vertical = getvalue(wₑ, i, j, k, grid)
+    return (horizontal..., vertical)
+end
+
+# 2D vectors
+@inline function extrinsic_vector(i, j, k, grid::OrthogonalSphericalShellGrid, uᵢ, vᵢ)
+
+    # Intrinsic components evaluated at (i, j, k)
+    u = getvalue(uᵢ, i, j, k, grid)
+    v = getvalue(vᵢ, i, j, k, grid)
+
+    # Latitude change (in radians) between the two y-faces of the cell, divided by the cell's
+    # y-spacing. θ is the rotation angle between the intrinsic and the extrinsic reference frame.
+    φ_jplus = φnode(i, j+1, 1, grid, Center(), Face(), Center())
+    φ_j     = φnode(i, j,   1, grid, Center(), Face(), Center())
+    Rcosθ   = deg2rad(φ_jplus - φ_j) / Δyᶜᶜᶜ(i, j, 1, grid)
+
+    # Latitude change (in radians) between the two x-faces of the cell, divided by the cell's
+    # x-spacing, with the sign flipped.
+    φ_iplus = φnode(i+1, j, 1, grid, Face(), Center(), Center())
+    φ_i     = φnode(i,   j, 1, grid, Face(), Center(), Center())
+    Rsinθ   = - deg2rad(φ_iplus - φ_i) / Δxᶜᶜᶜ(i, j, 1, grid)
+
+    # Normalization for the rotation angles
+    R = sqrt(Rcosθ^2 + Rsinθ^2)
+    cosθ = Rcosθ / R
+    sinθ = Rsinθ / R
+
+    # Rotate the intrinsic components into the extrinsic frame
+    uₑ = u * cosθ - v * sinθ
+    vₑ = u * sinθ + v * cosθ
+
+    return uₑ, vₑ
+end
+
+# 3D vectors: rotate the horizontal components, pass the vertical component through
+@inline function extrinsic_vector(i, j, k, grid::OrthogonalSphericalShellGrid, uᵢ, vᵢ, wᵢ)
+    # Intrinsic -> extrinsic needs the two-component `extrinsic_vector`, not its inverse `intrinsic_vector`
+    uₑ, vₑ = extrinsic_vector(i, j, k, grid, uᵢ, vᵢ)
+    # The vertical component is only evaluated at (i, j, k); it is not rotated
+    wₑ = getvalue(wᵢ, i, j, k, grid)
+    return uₑ, vₑ, wₑ
+end
\ No newline at end of file
diff --git a/src/OutputReaders/field_dataset.jl b/src/OutputReaders/field_dataset.jl
index 8408277349..dc6072eb6d 100644
--- a/src/OutputReaders/field_dataset.jl
+++ b/src/OutputReaders/field_dataset.jl
@@ -1,5 +1,5 @@
 struct FieldDataset{F, M, P}
-    fields :: F
+      fields :: F
     metadata :: M
     filepath :: P
 end
@@ -48,6 +48,25 @@ function FieldDataset(filepath;
 end
 
 Base.getindex(fds::FieldDataset, inds...) = Base.getindex(fds.fields, inds...)
+Base.getindex(fds::FieldDataset, i::Symbol) = Base.getindex(fds, string(i))
 
-Base.show(io::IO, fds::FieldDataset) =
-  print(io, "FieldDataset with $(length(fds.fields)) fields and $(length(fds.metadata)) metadata entries.")
+function Base.getproperty(fds::FieldDataset, name::Symbol)
+    # Names reported by `propertynames(fds)` are read directly with `getfield`;
+    # every other name is forwarded to `getindex(fds, name)`.
+    is_listed_property = name in propertynames(fds)
+    is_listed_property && return getfield(fds, name)
+    return getindex(fds, name)
+end
+
+function Base.show(io::IO, fds::FieldDataset)
+    # One header line, then one tree-style line per field; the last field closes the tree with `└──`.
+    Nf = length(fds.fields)
+    Nm = length(fds.metadata)
+    header = "FieldDataset with $Nf fields and $Nm metadata entries:\n"
+
+    branch(n) = n == Nf ? "└── " : "├── "
+    entries = (branch(n) * "$name: " * summary(fts) * '\n'
+               for (n, (name, fts)) in enumerate(pairs(fds.fields)))
+
+    return print(io, header * join(entries))
+end
diff --git a/src/OutputReaders/field_time_series.jl b/src/OutputReaders/field_time_series.jl
index 82ba2689bf..3177bc6283 100644
--- a/src/OutputReaders/field_time_series.jl
+++ b/src/OutputReaders/field_time_series.jl
@@ -4,6 +4,7 @@ using OffsetArrays
 using Statistics
 using JLD2
 using Adapt
+using CUDA: @allowscalar
 
 using Dates: AbstractTime
 using KernelAbstractions: @kernel, @index
@@ -251,8 +252,8 @@ mutable struct FieldTimeSeries{LX, LY, LZ, TI, K, I, D, G, ET, B, χ, P, N} <: A
         end
         
         if time_indexing isa Cyclical{Nothing} # we have to infer the period
-            Δt = times[end] - times[end-1]
-            period = times[end] - times[1] + Δt
+            Δt = @allowscalar times[end] - times[end-1]
+            period = @allowscalar times[end] - times[1] + Δt
             time_indexing = Cyclical(period)
         end
 
@@ -268,15 +269,15 @@ mutable struct FieldTimeSeries{LX, LY, LZ, TI, K, I, D, G, ET, B, χ, P, N} <: A
 end
 
 on_architecture(to, fts::FieldTimeSeries{LX, LY, LZ}) where {LX, LY, LZ} = 
-    FieldTimeSeries{LX, LY, LZ}(on_architecture(to, data),
-                                on_architecture(to, grid),
-                                on_architecture(to, backend),
-                                on_architecture(to, bcs),
-                                on_architecture(to, indices), 
-                                on_architecture(to, times),
-                                on_architecture(to, path),
-                                on_architecture(to, name),
-                                on_architecture(to, time_indexing))
+    FieldTimeSeries{LX, LY, LZ}(on_architecture(to, fts.data),
+                                on_architecture(to, fts.grid),
+                                on_architecture(to, fts.backend),
+                                on_architecture(to, fts.bcs),
+                                on_architecture(to, fts.indices), 
+                                on_architecture(to, fts.times),
+                                on_architecture(to, fts.path),
+                                on_architecture(to, fts.name),
+                                on_architecture(to, fts.time_indexing))
 
 #####
 ##### Minimal implementation of FieldTimeSeries for use in GPU kernels
@@ -284,7 +285,7 @@ on_architecture(to, fts::FieldTimeSeries{LX, LY, LZ}) where {LX, LY, LZ} =
 ##### Supports reduced locations + time-interpolation / extrapolation
 #####
 
-struct GPUAdaptedFieldTimeSeries{LX, LY, LZ, TI, K, ET, D, χ} <: AbstractArray{ET, 4}
+struct GPUAdaptedFieldTimeSeries{LX, LY, LZ, TI, K, ET, D, χ} <: AbstractField{LX, LY, LZ, Nothing, ET, 4}
              data :: D
             times :: χ
           backend :: K
diff --git a/src/OutputReaders/field_time_series_indexing.jl b/src/OutputReaders/field_time_series_indexing.jl
index faf7bdff82..a9b89bb9df 100644
--- a/src/OutputReaders/field_time_series_indexing.jl
+++ b/src/OutputReaders/field_time_series_indexing.jl
@@ -1,5 +1,8 @@
+using Oceananigans.Grids: _node
+using Oceananigans.Fields: interpolator, _interpolate, fractional_indices, flatten_node
+using Oceananigans.Architectures: architecture
+
 import Oceananigans.Fields: interpolate
-using Oceananigans.Fields: interpolator, _interpolate, fractional_indices
 
 #####
 ##### Computation of time indices for interpolation
@@ -133,7 +136,7 @@ end
 # Linear time interpolation
 function Base.getindex(fts::FieldTimeSeries, time_index::Time)
     # Calculate fractional index (0 ≤ ñ ≤ 1)
-    ñ, n₁, n₂ = interpolating_time_indices(fts.time_indexing, fts.times, time_index.time)
+    ñ, n₁, n₂ = cpu_interpolating_time_indices(architecture(fts), fts.times, fts.time_indexing, time_index.time)
 
     if n₁ == n₂ # no interpolation needed
         return fts[n₁]
@@ -152,27 +155,28 @@ end
 ##### Linear time- and space-interpolation of a FTS
 #####
 
-@inline function interpolate(at_node, at_time_index::Time, from_fts::FlavorOfFTS, from_loc, from_grid)
+@inline function interpolate(to_node, to_time_index::Time, from_fts::FlavorOfFTS, from_loc, from_grid)
     data = from_fts.data
     times = from_fts.times
     backend = from_fts.backend
     time_indexing = from_fts.time_indexing
-    return interpolate(at_node, at_time_index, data, from_loc, from_grid, times, backend, time_indexing)
+    return interpolate(to_node, to_time_index, data, from_loc, from_grid, times, backend, time_indexing)
 end
 
-@inline function interpolate(at_node, at_time_index::Time, data::OffsetArray,
+@inline function interpolate(to_node, to_time_index::Time, data::OffsetArray,
                              from_loc, from_grid, times, backend, time_indexing)
 
-    at_time = at_time_index.time
+    to_time = to_time_index.time
 
     # Build space interpolators
-    ii, jj, kk = fractional_indices(at_node, from_grid, from_loc...)
+    to_node = flatten_node(to_node...)
+    ii, jj, kk = fractional_indices(to_node, from_grid, from_loc...)
 
     ix = interpolator(ii)
     iy = interpolator(jj)
     iz = interpolator(kk)
 
-    ñ, n₁, n₂ = interpolating_time_indices(time_indexing, times, at_time)
+    ñ, n₁, n₂ = interpolating_time_indices(time_indexing, times, to_time)
 
     Nt = length(times)
     m₁ = memory_index(backend, time_indexing, Nt, n₁)
@@ -198,11 +202,9 @@ function interpolate!(target_fts::FieldTimeSeries, source_fts::FieldTimeSeries)
     source_location = map(instantiate, location(source_fts))
     target_location = map(instantiate, location(target_fts))
 
-    target_times = map(Time, target_fts.times)
-
     launch!(arch, target_grid, size(target_fts),
             _interpolate_field_time_series!,
-            target_fts.data, target_grid, target_location, target_times,
+            target_fts.data, target_grid, target_location, target_fts.times,
             source_fts, source_grid, source_location)
 
     fill_halo_regions!(target_fts)
@@ -216,10 +218,10 @@ end
     # 4D index, cool!
     i, j, k, n = @index(Global, NTuple)
 
-    target_node = node(i, j, k, target_grid, target_location...)
-    at_time     = @inbounds target_times[n]
+    target_node = _node(i, j, k, target_grid, target_location...)
+    to_time     = @inbounds Time(target_times[n])
 
-    @inbounds target_fts[i, j, k, n] = interpolate(target_node, at_time,
+    @inbounds target_fts[i, j, k, n] = interpolate(target_node, to_time,
                                                    source_fts, source_location, source_grid)
 end
 
@@ -227,6 +229,18 @@ end
 ##### FieldTimeSeries updating
 #####
 
+# `times` must be available on the CPU before `interpolating_time_indices` indexes into it.
+# This is always the case for ranges. If `times` is a vector that resides on the GPU, it is copied to the CPU for safe indexing.
+# TODO: Copying the whole array allocates memory; avoiding that would require refactoring `FieldTimeSeries`
+# to include a CPU-allocated times array. The trailing `arch` argument below is unused and optional (kept for compatibility).
+cpu_interpolating_time_indices(::CPU, times, time_indexing, t, arch=nothing) = interpolating_time_indices(time_indexing, times, t)
+cpu_interpolating_time_indices(::CPU, times::AbstractVector, time_indexing, t) = interpolating_time_indices(time_indexing, times, t)
+
+function cpu_interpolating_time_indices(::GPU, times::AbstractVector, time_indexing, t)
+    cpu_times = on_architecture(CPU(), times)
+    return interpolating_time_indices(time_indexing, cpu_times, t)
+end
+
 # Fallbacks that do nothing
 update_field_time_series!(fts, time::Time) = nothing
 update_field_time_series!(fts, n::Int) = nothing
@@ -235,7 +249,7 @@ update_field_time_series!(fts, n::Int) = nothing
 # Linear extrapolation, simple version
 function update_field_time_series!(fts::PartlyInMemoryFTS, time_index::Time)
     t = time_index.time
-    ñ, n₁, n₂ = interpolating_time_indices(fts.time_indexing, fts.times, t)
+    ñ, n₁, n₂ = cpu_interpolating_time_indices(architecture(fts), fts.times, fts.time_indexing, t)
     return update_field_time_series!(fts, n₁, n₂)
 end
 
diff --git a/src/OutputReaders/set_field_time_series.jl b/src/OutputReaders/set_field_time_series.jl
index f62da61106..d450926b3e 100644
--- a/src/OutputReaders/set_field_time_series.jl
+++ b/src/OutputReaders/set_field_time_series.jl
@@ -1,4 +1,5 @@
 using Printf
+using Oceananigans.Architectures: cpu_architecture
 
 #####
 ##### set!
@@ -15,6 +16,8 @@ function set!(fts::InMemoryFTS, path::String=fts.path, name::String=fts.name)
     file_times = [file["timeseries/t/$i"] for i in file_iterations]
     close(file)
 
+    arch = architecture(fts)
+
     # TODO: a potential optimization here might be to load
     # all of the data into a single array, and then transfer that
     # to parent(fts).
@@ -33,7 +36,7 @@ function set!(fts::InMemoryFTS, path::String=fts.path, name::String=fts.name)
         
         # Note: use the CPU for this step
         field_n = Field(location(fts), path, name, file_iter,
-                        architecture = CPU(),
+                        architecture = cpu_architecture(arch),
                         indices = fts.indices,
                         boundary_conditions = fts.boundary_conditions)
 
diff --git a/src/OutputWriters/OutputWriters.jl b/src/OutputWriters/OutputWriters.jl
index d9cbe64953..d1d12cb23f 100644
--- a/src/OutputWriters/OutputWriters.jl
+++ b/src/OutputWriters/OutputWriters.jl
@@ -21,6 +21,9 @@ using OffsetArrays
 
 import Oceananigans: write_output!, initialize!
 
+const c = Center()
+const f = Face()
+
 Base.open(ow::AbstractOutputWriter) = nothing
 Base.close(ow::AbstractOutputWriter) = nothing
 
diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index 2bad4db170..3f84773382 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -231,6 +231,7 @@ function set!(model, filepath::AbstractString)
         # Update model clock
         model.clock.iteration = checkpointed_clock.iteration
         model.clock.time = checkpointed_clock.time
+        model.clock.last_Δt = checkpointed_clock.last_Δt
     end
 
     return nothing
@@ -265,8 +266,6 @@ end
 set_time_stepper!(timestepper::RungeKutta3TimeStepper, arch, file, model_fields) =
     set_time_stepper_tendencies!(timestepper, arch, file, model_fields)
 
-function set_time_stepper!(timestepper::QuasiAdamsBashforth2TimeStepper, arch, file, model_fields)
+set_time_stepper!(timestepper::QuasiAdamsBashforth2TimeStepper, arch, file, model_fields) =
     set_time_stepper_tendencies!(timestepper, arch, file, model_fields)
-    timestepper.previous_Δt = file["timestepper/previous_Δt"]
-    return nothing
-end
+
diff --git a/src/OutputWriters/jld2_output_writer.jl b/src/OutputWriters/jld2_output_writer.jl
index 9034f8db50..50a0a31937 100644
--- a/src/OutputWriters/jld2_output_writer.jl
+++ b/src/OutputWriters/jld2_output_writer.jl
@@ -112,11 +112,11 @@ Example
 
 Write out 3D fields for ``u``, ``v``, ``w``, and a tracer ``c``, along with a horizontal average:
 
-```jldoctest jld2_output_writer
+```
 using Oceananigans
 using Oceananigans.Utils: hour, minute
 
-model = NonhydrostaticModel(grid=RectilinearGrid(size=(1, 1, 1), extent=(1, 1, 1)), tracers=(:c,))
+model = NonhydrostaticModel(grid=RectilinearGrid(size=(1, 1, 1), extent=(1, 1, 1)), tracers=:c)
 simulation = Simulation(model, Δt=12, stop_time=1hour)
 
 function init_save_some_metadata!(file, model)
@@ -141,13 +141,13 @@ JLD2OutputWriter scheduled on TimeInterval(20 minutes):
 ├── array type: Array{Float64}
 ├── including: [:grid, :coriolis, :buoyancy, :closure]
 ├── file_splitting: NoFileSplitting
-└── file size: 27.2 KiB
+└── file size: 28.0 KiB
 ```
 
 and a time- and horizontal-average of tracer ``c`` every 20 minutes of simulation time
 to a file called `some_averaged_data.jld2`
 
-```jldoctest jld2_output_writer
+```
 simulation.output_writers[:avg_c] = JLD2OutputWriter(model, (; c=c_avg),
                                                      filename = "some_averaged_data.jld2",
                                                      schedule = AveragedTimeInterval(20minute, window=5minute))
@@ -159,7 +159,7 @@ JLD2OutputWriter scheduled on TimeInterval(20 minutes):
 ├── array type: Array{Float64}
 ├── including: [:grid, :coriolis, :buoyancy, :closure]
 ├── file_splitting: NoFileSplitting
-└── file size: 17.3 KiB
+└── file size: 17.8 KiB
 ```
 """
 function JLD2OutputWriter(model, outputs; filename, schedule,
diff --git a/src/OutputWriters/netcdf_output_writer.jl b/src/OutputWriters/netcdf_output_writer.jl
index 6b83fd6e09..6c3ff9eafe 100644
--- a/src/OutputWriters/netcdf_output_writer.jl
+++ b/src/OutputWriters/netcdf_output_writer.jl
@@ -49,32 +49,70 @@ zdim(::Nothing) = tuple()
 netcdf_spatial_dimensions(::AbstractField{LX, LY, LZ}) where {LX, LY, LZ} =
     tuple(xdim(instantiate(LX))..., ydim(instantiate(LY))..., zdim(instantiate(LZ))...)
 
-native_dimensions_for_netcdf_output(grid, indices, TX, TY, TZ, Hx, Hy, Hz) =
-    Dict("xC" => parent(xnodes(grid, Center(); with_halos=true))[parent_index_range(indices["xC"][1], Center(), TX(), Hx)],
-         "xF" => parent(xnodes(grid, Face();   with_halos=true))[parent_index_range(indices["xF"][1],   Face(), TX(), Hx)],
-         "yC" => parent(ynodes(grid, Center(); with_halos=true))[parent_index_range(indices["yC"][2], Center(), TY(), Hy)],
-         "yF" => parent(ynodes(grid, Face();   with_halos=true))[parent_index_range(indices["yF"][2],   Face(), TY(), Hy)],
-         "zC" => parent(znodes(grid, Center(); with_halos=true))[parent_index_range(indices["zC"][3], Center(), TZ(), Hz)],
-         "zF" => parent(znodes(grid, Face();   with_halos=true))[parent_index_range(indices["zF"][3],   Face(), TZ(), Hz)])
-
-native_dimensions_for_netcdf_output(grid::AbstractCurvilinearGrid, indices, TX, TY, TZ, Hx, Hy, Hz) =
-    Dict("xC" => parent(λnodes(grid, Center(); with_halos=true))[parent_index_range(indices["xC"][1], Center(), TX(), Hx)],
-         "xF" => parent(λnodes(grid, Face();   with_halos=true))[parent_index_range(indices["xF"][1],   Face(), TX(), Hx)],
-         "yC" => parent(φnodes(grid, Center(); with_halos=true))[parent_index_range(indices["yC"][2], Center(), TY(), Hy)],
-         "yF" => parent(φnodes(grid, Face();   with_halos=true))[parent_index_range(indices["yF"][2],   Face(), TY(), Hy)],
-         "zC" => parent(znodes(grid, Center(); with_halos=true))[parent_index_range(indices["zC"][3], Center(), TZ(), Hz)],
-         "zF" => parent(znodes(grid, Face();   with_halos=true))[parent_index_range(indices["zF"][3],   Face(), TZ(), Hz)])
+function native_dimensions_for_netcdf_output(grid, indices, TX, TY, TZ, Hx, Hy, Hz)
+    with_halos = true
+
+    xC = xnodes(grid, c; with_halos)
+    xF = xnodes(grid, f; with_halos)
+    yC = ynodes(grid, c; with_halos)
+    yF = ynodes(grid, f; with_halos)
+    zC = znodes(grid, c; with_halos)
+    zF = znodes(grid, f; with_halos)
+
+    xC = isnothing(xC) ? [0.0] : parent(xC)  
+    xF = isnothing(xF) ? [0.0] : parent(xF) 
+    yC = isnothing(yC) ? [0.0] : parent(yC) 
+    yF = isnothing(yF) ? [0.0] : parent(yF) 
+    zC = isnothing(zC) ? [0.0] : parent(zC) 
+    zF = isnothing(zF) ? [0.0] : parent(zF) 
+
+    dims = Dict("xC" => xC[parent_index_range(indices["xC"][1], c, TX(), Hx)],
+                "xF" => xF[parent_index_range(indices["xF"][1], f, TX(), Hx)],
+                "yC" => yC[parent_index_range(indices["yC"][2], c, TY(), Hy)],
+                "yF" => yF[parent_index_range(indices["yF"][2], f, TY(), Hy)],
+                "zC" => zC[parent_index_range(indices["zC"][3], c, TZ(), Hz)],
+                "zF" => zF[parent_index_range(indices["zF"][3], f, TZ(), Hz)])
+
+    return dims
+end
+
+function native_dimensions_for_netcdf_output(grid::AbstractCurvilinearGrid, indices, TX, TY, TZ, Hx, Hy, Hz)
+    with_halos = true
+
+    xC = λnodes(grid, c; with_halos)
+    xF = λnodes(grid, f; with_halos)
+    yC = φnodes(grid, c; with_halos)
+    yF = φnodes(grid, f; with_halos)
+    zC = znodes(grid, c; with_halos)
+    zF = znodes(grid, f; with_halos)
+
+    xC = isnothing(xC) ? [0.0] : parent(xC)  
+    xF = isnothing(xF) ? [0.0] : parent(xF) 
+    yC = isnothing(yC) ? [0.0] : parent(yC) 
+    yF = isnothing(yF) ? [0.0] : parent(yF) 
+    zC = isnothing(zC) ? [0.0] : parent(zC) 
+    zF = isnothing(zF) ? [0.0] : parent(zF) 
+
+    dims = Dict("xC" => xC[parent_index_range(indices["xC"][1], c, TX(), Hx)],
+                "xF" => xF[parent_index_range(indices["xF"][1], f, TX(), Hx)],
+                "yC" => yC[parent_index_range(indices["yC"][2], c, TY(), Hy)],
+                "yF" => yF[parent_index_range(indices["yF"][2], f, TY(), Hy)],
+                "zC" => zC[parent_index_range(indices["zC"][3], c, TZ(), Hz)],
+                "zF" => zF[parent_index_range(indices["zF"][3], f, TZ(), Hz)])
+
+    return dims
+end
 
 function default_dimensions(output, grid, indices, with_halos)
     Hx, Hy, Hz = halo_size(grid)
     TX, TY, TZ = topo = topology(grid)
 
-    locs = Dict("xC" => (Center(), Center(), Center()),
-                "xF" => (Face(),   Center(), Center()),
-                "yC" => (Center(), Center(), Center()),
-                "yF" => (Center(), Face(),   Center()),
-                "zC" => (Center(), Center(), Center()),
-                "zF" => (Center(), Center(), Face()  ))
+    locs = Dict("xC" => (c, c, c),
+                "xF" => (f, c, c),
+                "yC" => (c, c, c),
+                "yF" => (c, f, c),
+                "zC" => (c, c, c),
+                "zF" => (c, c, f))
 
     topo = map(instantiate, topology(grid))
 
@@ -275,7 +313,7 @@ NetCDFOutputWriter scheduled on TimeInterval(1 minute):
 ├── 2 outputs: (c, u)
 └── array type: Array{Float64}
 ├── file_splitting: NoFileSplitting
-└── file size: 14.8 KiB
+└── file size: 14.9 KiB
 ```
 
 ```jldoctest netcdf1
@@ -290,7 +328,7 @@ NetCDFOutputWriter scheduled on TimeInterval(1 minute):
 ├── 2 outputs: (c, u)
 └── array type: Array{Float64}
 ├── file_splitting: NoFileSplitting
-└── file size: 14.8 KiB
+└── file size: 14.9 KiB
 ```
 
 ```jldoctest netcdf1
@@ -320,19 +358,18 @@ Nx, Ny, Nz = 16, 16, 16
 
 grid = RectilinearGrid(size=(Nx, Ny, Nz), extent=(1, 2, 3))
 
-model = NonhydrostaticModel(grid=grid)
+model = NonhydrostaticModel(; grid)
 
 simulation = Simulation(model, Δt=1.25, stop_iteration=3)
 
-f(model) = model.clock.time^2; # scalar output
+f(model) = model.clock.time^2 # scalar output
 
-g(model) = model.clock.time .* exp.(znodes(grid, Center())) # vector/profile output
+zC = znodes(grid, Center())
+g(model) = model.clock.time .* exp.(zC) # vector/profile output
 
 xC, yF = xnodes(grid, Center()), ynodes(grid, Face())
-
 XC = [xC[i] for i in 1:Nx, j in 1:Ny]
 YF = [yF[j] for i in 1:Nx, j in 1:Ny]
-
 h(model) = @. model.clock.time * sin(XC) * cos(YF) # xy slice output
 
 outputs = Dict("scalar" => f, "profile" => g, "slice" => h)
@@ -391,7 +428,7 @@ NetCDFOutputWriter scheduled on IterationInterval(1):
 ├── 1 outputs: u
 └── array type: Array{Float64}
 ├── file_splitting: NoFileSplitting
-└── file size: 14.5 KiB
+└── file size: 14.6 KiB
 ```
 """
 function NetCDFOutputWriter(model, outputs;
diff --git a/src/OutputWriters/output_writer_utils.jl b/src/OutputWriters/output_writer_utils.jl
index 30e3cead20..af6c6f6834 100644
--- a/src/OutputWriters/output_writer_utils.jl
+++ b/src/OutputWriters/output_writer_utils.jl
@@ -160,7 +160,6 @@ end
 function serializeproperty!(file, address, ts::QuasiAdamsBashforth2TimeStepper)
     serializeproperty!(file, address * "/Gⁿ", ts.Gⁿ)
     serializeproperty!(file, address * "/G⁻", ts.G⁻)
-    serializeproperty!(file, address * "/previous_Δt", ts.previous_Δt)
     return nothing
 end
 
@@ -216,7 +215,9 @@ show_array_type(a::Type{Array{T}}) where T = "Array{$T}"
 If `filename` ends in `ext`, return `filename`. Otherwise return `filename * ext`.
 """
 function auto_extension(filename, ext) 
-    Next = length(ext)
-    filename[end-Next+1:end] == ext || (filename *= ext)
-    return filename
+    if endswith(filename, ext)
+        return filename
+    else
+        return filename * ext
+    end
 end
diff --git a/src/OutputWriters/windowed_time_average.jl b/src/OutputWriters/windowed_time_average.jl
index 914b33aa53..c36a7cf528 100644
--- a/src/OutputWriters/windowed_time_average.jl
+++ b/src/OutputWriters/windowed_time_average.jl
@@ -78,7 +78,7 @@ JLD2OutputWriter scheduled on TimeInterval(4 days):
 ├── array type: Array{Float64}
 ├── including: [:grid, :coriolis, :buoyancy, :closure]
 ├── file_splitting: NoFileSplitting
-└── file size: 26.5 KiB
+└── file size: 27.6 KiB
 ```
 """
 function AveragedTimeInterval(interval; window=interval, stride=1)
diff --git a/src/Simulations/callback.jl b/src/Simulations/callback.jl
index 4956ca3c0e..acc40486f1 100644
--- a/src/Simulations/callback.jl
+++ b/src/Simulations/callback.jl
@@ -14,12 +14,13 @@ end
 @inline (callback::Callback)(sim) = callback.func(sim, callback.parameters)
 @inline (callback::Callback{<:Nothing})(sim) = callback.func(sim)
 
-# Fallback initialization: initialize the schedule.
-# Then, if the schedule calls for it, execute the callback.
-function initialize!(callback::Callback, sim)
-    initialize!(callback.schedule, sim.model) && callback(sim)
-    return nothing
-end
+"""
+    initialize!(callback::Callback, sim)
+
+Initialize `callback`. By default, this does nothing, but
+can be optionally specialized on the type parameters of `Callback`.
+"""
+initialize!(callback::Callback, sim) = nothing
 
 """
     Callback(func, schedule=IterationInterval(1);
diff --git a/src/Simulations/run.jl b/src/Simulations/run.jl
index b29931b536..7d2d7a2aba 100644
--- a/src/Simulations/run.jl
+++ b/src/Simulations/run.jl
@@ -193,18 +193,26 @@ function initialize!(sim::Simulation)
     # Output and diagnostics initialization
     [add_dependencies!(sim.diagnostics, writer) for writer in values(sim.output_writers)]
 
+    # Initialize schedules
+    scheduled_activities = Iterators.flatten((values(sim.diagnostics),
+                                              values(sim.callbacks),
+                                              values(sim.output_writers)))
+
+    for activity in scheduled_activities
+        initialize!(activity.schedule, sim.model)
+    end
+
     # Reset! the model time-stepper, evaluate all diagnostics, and write all output at first iteration
     if clock.iteration == 0
         reset!(timestepper(sim.model))
 
         # Initialize schedules and run diagnostics, callbacks, and output writers
         for diag in values(sim.diagnostics)
-            diag.schedule(sim.model)
             run_diagnostic!(diag, model)
         end
 
         for callback in values(sim.callbacks) 
-            callback.callsite isa TimeStepCallsite && initialize!(callback, sim)
+            callback.callsite isa TimeStepCallsite && callback(sim)
         end
 
         for writer in values(sim.output_writers)
diff --git a/src/Simulations/simulation.jl b/src/Simulations/simulation.jl
index fc90daf99b..3b73d85702 100644
--- a/src/Simulations/simulation.jl
+++ b/src/Simulations/simulation.jl
@@ -6,8 +6,6 @@ import Oceananigans.Models: iteration
 import Oceananigans.Utils: prettytime
 import Oceananigans.TimeSteppers: reset!
 
-# It's not a model --- its a simulation!
-
 default_progress(simulation) = nothing
 
 mutable struct Simulation{ML, DT, ST, DI, OW, CB}
diff --git a/src/Simulations/time_step_wizard.jl b/src/Simulations/time_step_wizard.jl
index 18ec2b86fb..d8773d5e7f 100644
--- a/src/Simulations/time_step_wizard.jl
+++ b/src/Simulations/time_step_wizard.jl
@@ -35,7 +35,7 @@ for advective and diffusive Courant-Friedrichs-Lewy (CFL) numbers (`cfl` and `di
 subject to the limits
 
 ```julia
-max(min_Δt, min_change * previous_Δt) ≤ new_Δt ≤ min(max_Δt, max_change * previous_Δt)
+max(min_Δt, min_change * last_Δt) ≤ new_Δt ≤ min(max_Δt, max_change * last_Δt)
 ```
 
 where `new_Δt` is the new time step calculated by the `TimeStepWizard`.
diff --git a/src/Solvers/Solvers.jl b/src/Solvers/Solvers.jl
index c5a5bf13ce..bbad459ea9 100644
--- a/src/Solvers/Solvers.jl
+++ b/src/Solvers/Solvers.jl
@@ -4,7 +4,7 @@ export
     BatchedTridiagonalSolver, solve!,
     FFTBasedPoissonSolver,
     FourierTridiagonalPoissonSolver,
-    PreconditionedConjugateGradientSolver,
+    ConjugateGradientSolver,
     HeptadiagonalIterativeSolver
 
 using Statistics
@@ -14,12 +14,14 @@ using SparseArrays
 using KernelAbstractions
 
 using Oceananigans.Architectures: device, CPU, GPU, array_type, on_architecture
+using Oceananigans.BoundaryConditions: fill_halo_regions!
 using Oceananigans.Utils
 using Oceananigans.Grids
 using Oceananigans.BoundaryConditions
 using Oceananigans.Fields
 
-using Oceananigans.Grids: unpack_grid
+using Oceananigans.Grids: unpack_grid, inactive_cell
+using Oceananigans.Grids: XYRegularRG, XZRegularRG, YZRegularRG, XYZRegularRG
 
 """
     ω(M, k)
@@ -33,16 +35,24 @@ reshaped_size(N, dim) = dim == 1 ? (N, 1, 1) :
                         dim == 3 ? (1, 1, N) : nothing
 
 include("batched_tridiagonal_solver.jl")
+include("conjugate_gradient_solver.jl")
 include("poisson_eigenvalues.jl")
 include("index_permutations.jl")
 include("discrete_transforms.jl")
 include("plan_transforms.jl")
 include("fft_based_poisson_solver.jl")
 include("fourier_tridiagonal_poisson_solver.jl")
-include("preconditioned_conjugate_gradient_solver.jl")
+include("conjugate_gradient_poisson_solver.jl")
 include("sparse_approximate_inverse.jl")
 include("matrix_solver_utils.jl")
 include("sparse_preconditioners.jl")
 include("heptadiagonal_iterative_solver.jl")
 
+const GridWithFFTSolver = Union{XYZRegularRG, XYRegularRG, XZRegularRG, YZRegularRG}
+const GridWithFourierTridiagonalSolver = Union{XYRegularRG, XZRegularRG, YZRegularRG}
+
+fft_poisson_solver(grid::XYZRegularRG) = FFTBasedPoissonSolver(grid)
+fft_poisson_solver(grid::GridWithFourierTridiagonalSolver) =
+    FourierTridiagonalPoissonSolver(grid) # `grid` is the rectilinear grid itself; callers unwrap immersed grids first
+
 end # module
diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 6316fc15d2..2886847938 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -18,8 +18,12 @@ struct BatchedTridiagonalSolver{A, B, C, T, G, P, D}
     tridiagonal_direction :: D
 end
 
-architecture(solver::BatchedTridiagonalSolver) = architecture(solver.grid)
+# Some aliases...
+const XTridiagonalSolver = BatchedTridiagonalSolver{A, B, C, T, G, P, <:XDirection} where {A, B, C, T, G, P}
+const YTridiagonalSolver = BatchedTridiagonalSolver{A, B, C, T, G, P, <:YDirection} where {A, B, C, T, G, P}
+const ZTridiagonalSolver = BatchedTridiagonalSolver{A, B, C, T, G, P, <:ZDirection} where {A, B, C, T, G, P}
 
+architecture(solver::BatchedTridiagonalSolver) = architecture(solver.grid)
 
 """
     BatchedTridiagonalSolver(grid;
diff --git a/src/Solvers/conjugate_gradient_poisson_solver.jl b/src/Solvers/conjugate_gradient_poisson_solver.jl
new file mode 100644
index 0000000000..13ae437932
--- /dev/null
+++ b/src/Solvers/conjugate_gradient_poisson_solver.jl
@@ -0,0 +1,177 @@
+using Oceananigans.Operators: divᶜᶜᶜ, ∇²ᶜᶜᶜ 
+using Statistics: mean
+
+using KernelAbstractions: @kernel, @index
+
+import Oceananigans.Architectures: architecture
+
+struct ConjugateGradientPoissonSolver{G, R, S}
+    grid :: G
+    right_hand_side :: R
+    conjugate_gradient_solver :: S
+end
+
+architecture(cgps::ConjugateGradientPoissonSolver) = architecture(cgps.grid)
+iteration(cgps::ConjugateGradientPoissonSolver) = iteration(cgps.conjugate_gradient_solver)
+
+Base.summary(ips::ConjugateGradientPoissonSolver) =
+    string("ConjugateGradientPoissonSolver on ", summary(ips.grid))
+
+function Base.show(io::IO, ips::ConjugateGradientPoissonSolver)
+    A = architecture(ips.grid)
+    print(io, "ConjugateGradientPoissonSolver:", '\n',
+              "├── grid: ", summary(ips.grid), '\n',
+              "└── conjugate_gradient_solver: ", summary(ips.conjugate_gradient_solver), '\n',
+              "    ├── maxiter: ", prettysummary(ips.conjugate_gradient_solver.maxiter), '\n',
+              "    ├── reltol: ", prettysummary(ips.conjugate_gradient_solver.reltol), '\n',
+              "    ├── abstol: ", prettysummary(ips.conjugate_gradient_solver.abstol), '\n',
+              "    ├── preconditioner: ", prettysummary(ips.conjugate_gradient_solver.preconditioner), '\n',
+              "    └── iteration: ", prettysummary(ips.conjugate_gradient_solver.iteration))
+end
+
+@kernel function laplacian!(∇²ϕ, grid, ϕ)
+    i, j, k = @index(Global, NTuple)
+    @inbounds ∇²ϕ[i, j, k] = ∇²ᶜᶜᶜ(i, j, k, grid, ϕ)
+end
+
+function compute_laplacian!(∇²ϕ, ϕ)
+    grid = ϕ.grid
+    arch = architecture(grid)
+    fill_halo_regions!(ϕ)
+    launch!(arch, grid, :xyz, laplacian!, ∇²ϕ, grid, ϕ)
+    return nothing
+end
+
+struct DefaultPreconditioner end
+
+function ConjugateGradientPoissonSolver(grid;
+                                        preconditioner = DefaultPreconditioner(),
+                                        reltol = sqrt(eps(grid)),
+                                        abstol = sqrt(eps(grid)),
+                                        kw...)
+
+    if preconditioner isa DefaultPreconditioner # try to make a useful default
+        if grid isa ImmersedBoundaryGrid && grid.underlying_grid isa GridWithFFTSolver
+            preconditioner = fft_poisson_solver(grid.underlying_grid)
+        else
+            preconditioner = DiagonallyDominantPreconditioner()
+        end
+    end
+
+    rhs = CenterField(grid)
+
+    conjugate_gradient_solver = ConjugateGradientSolver(compute_laplacian!;
+                                                        reltol,
+                                                        abstol,
+                                                        preconditioner,
+                                                        template_field = rhs,
+                                                        kw...)
+        
+    return ConjugateGradientPoissonSolver(grid, rhs, conjugate_gradient_solver)
+end
+
+#####
+##### A preconditioner based on the FFT solver
+#####
+
+@kernel function fft_preconditioner_rhs!(preconditioner_rhs, rhs)
+    i, j, k = @index(Global, NTuple)
+    @inbounds preconditioner_rhs[i, j, k] = rhs[i, j, k]
+end
+
+@kernel function fourier_tridiagonal_preconditioner_rhs!(preconditioner_rhs, ::XDirection, grid, rhs)
+    i, j, k = @index(Global, NTuple)
+    @inbounds preconditioner_rhs[i, j, k] = Δxᶜᶜᶜ(i, j, k, grid) * rhs[i, j, k]
+end
+
+@kernel function fourier_tridiagonal_preconditioner_rhs!(preconditioner_rhs, ::YDirection, grid, rhs)
+    i, j, k = @index(Global, NTuple)
+    @inbounds preconditioner_rhs[i, j, k] = Δyᶜᶜᶜ(i, j, k, grid) * rhs[i, j, k]
+end
+
+@kernel function fourier_tridiagonal_preconditioner_rhs!(preconditioner_rhs, ::ZDirection, grid, rhs)
+    i, j, k = @index(Global, NTuple)
+    @inbounds preconditioner_rhs[i, j, k] = Δzᶜᶜᶜ(i, j, k, grid) * rhs[i, j, k]
+end
+
+function compute_preconditioner_rhs!(solver::FFTBasedPoissonSolver, rhs)
+    grid = solver.grid
+    arch = architecture(grid)
+    launch!(arch, grid, :xyz, fft_preconditioner_rhs!, solver.storage, rhs)
+    return nothing
+end
+
+function compute_preconditioner_rhs!(solver::FourierTridiagonalPoissonSolver, rhs)
+    grid = solver.grid
+    arch = architecture(grid)
+    tridiagonal_dir = solver.batched_tridiagonal_solver.tridiagonal_direction
+    launch!(arch, grid, :xyz, fourier_tridiagonal_preconditioner_rhs!, # kernel args: (storage, direction, grid, rhs)
+            solver.storage, tridiagonal_dir, grid, rhs)
+    return nothing
+end
+
+const FFTBasedPreconditioner = Union{FFTBasedPoissonSolver, FourierTridiagonalPoissonSolver}
+
+function precondition!(p, preconditioner::FFTBasedPreconditioner, r, args...)
+    compute_preconditioner_rhs!(preconditioner, r)
+    p = solve!(p, preconditioner)
+
+    mean_p = mean(p)
+    grid = p.grid
+    arch = architecture(grid)
+    launch!(arch, grid, :xyz, subtract_and_mask!, p, grid, mean_p)
+
+    return p
+end
+
+@kernel function subtract_and_mask!(a, grid, b)
+    i, j, k = @index(Global, NTuple)
+    active = !inactive_cell(i, j, k, grid) # multiplying by `active` zeroes cells flagged by `inactive_cell`
+    @inbounds a[i, j, k] = (a[i, j, k] - b) * active
+end
+
+#####
+##### The "DiagonallyDominantPreconditioner" (Marshall et al 1997)
+#####
+
+struct DiagonallyDominantPreconditioner end
+Base.summary(::DiagonallyDominantPreconditioner) = "DiagonallyDominantPreconditioner"
+
+@inline function precondition!(p, ::DiagonallyDominantPreconditioner, r, args...)
+    grid = r.grid
+    arch = architecture(p)
+    fill_halo_regions!(r)
+    launch!(arch, grid, :xyz, _diagonally_dominant_precondition!, p, grid, r)
+
+    mean_p = mean(p)
+    launch!(arch, grid, :xyz, subtract_and_mask!, p, grid, mean_p)
+
+    return p
+end
+
+# Kernels that calculate coefficients for the preconditioner
+@inline Ax⁻(i, j, k, grid) = Axᶠᶜᶜ(i,   j, k, grid) / Δxᶠᶜᶜ(i,   j, k, grid) / Vᶜᶜᶜ(i, j, k, grid)
+@inline Ax⁺(i, j, k, grid) = Axᶠᶜᶜ(i+1, j, k, grid) / Δxᶠᶜᶜ(i+1, j, k, grid) / Vᶜᶜᶜ(i, j, k, grid)
+@inline Ay⁻(i, j, k, grid) = Ayᶜᶠᶜ(i, j,   k, grid) / Δyᶜᶠᶜ(i, j,   k, grid) / Vᶜᶜᶜ(i, j, k, grid)
+@inline Ay⁺(i, j, k, grid) = Ayᶜᶠᶜ(i, j+1, k, grid) / Δyᶜᶠᶜ(i, j+1, k, grid) / Vᶜᶜᶜ(i, j, k, grid)
+@inline Az⁻(i, j, k, grid) = Azᶜᶜᶠ(i, j, k,   grid) / Δzᶜᶜᶠ(i, j, k,   grid) / Vᶜᶜᶜ(i, j, k, grid)
+@inline Az⁺(i, j, k, grid) = Azᶜᶜᶠ(i, j, k+1, grid) / Δzᶜᶜᶠ(i, j, k+1, grid) / Vᶜᶜᶜ(i, j, k, grid)
+
+@inline Ac(i, j, k, grid) = - Ax⁻(i, j, k, grid) - Ax⁺(i, j, k, grid) -
+                              Ay⁻(i, j, k, grid) - Ay⁺(i, j, k, grid) -
+                              Az⁻(i, j, k, grid) - Az⁺(i, j, k, grid)
+                              
+@inline heuristic_residual(i, j, k, grid, r) =
+    @inbounds 1 / Ac(i, j, k, grid) * (r[i, j, k] - 2 * Ax⁻(i, j, k, grid) / (Ac(i, j, k, grid) + Ac(i-1, j, k, grid)) * r[i-1, j, k] -
+                                                    2 * Ax⁺(i, j, k, grid) / (Ac(i, j, k, grid) + Ac(i+1, j, k, grid)) * r[i+1, j, k] -
+                                                    2 * Ay⁻(i, j, k, grid) / (Ac(i, j, k, grid) + Ac(i, j-1, k, grid)) * r[i, j-1, k] -
+                                                    2 * Ay⁺(i, j, k, grid) / (Ac(i, j, k, grid) + Ac(i, j+1, k, grid)) * r[i, j+1, k] -
+                                                    2 * Az⁻(i, j, k, grid) / (Ac(i, j, k, grid) + Ac(i, j, k-1, grid)) * r[i, j, k-1] -
+                                                    2 * Az⁺(i, j, k, grid) / (Ac(i, j, k, grid) + Ac(i, j, k+1, grid)) * r[i, j, k+1])
+
+@kernel function _diagonally_dominant_precondition!(p, grid, r)
+    i, j, k = @index(Global, NTuple)
+    active = !inactive_cell(i, j, k, grid)
+    @inbounds p[i, j, k] = heuristic_residual(i, j, k, grid, r) * active
+end
+
diff --git a/src/Solvers/preconditioned_conjugate_gradient_solver.jl b/src/Solvers/conjugate_gradient_solver.jl
similarity index 79%
rename from src/Solvers/preconditioned_conjugate_gradient_solver.jl
rename to src/Solvers/conjugate_gradient_solver.jl
index 413e649e42..4d15d1ffbc 100644
--- a/src/Solvers/preconditioned_conjugate_gradient_solver.jl
+++ b/src/Solvers/conjugate_gradient_solver.jl
@@ -6,39 +6,42 @@ using LinearAlgebra
 
 import Oceananigans.Architectures: architecture
 
-mutable struct PreconditionedConjugateGradientSolver{A, G, L, T, F, M, P} 
-               architecture :: A
-                       grid :: G
-          linear_operation! :: L
-                     reltol :: T
-                     abstol :: T
-                    maxiter :: Int
-                  iteration :: Int
-                       ρⁱ⁻¹ :: T
+mutable struct ConjugateGradientSolver{A, G, L, T, F, M, P} 
+    architecture :: A
+    grid :: G
+    linear_operation! :: L
+    reltol :: T
+    abstol :: T
+    maxiter :: Int
+    iteration :: Int
+    ρⁱ⁻¹ :: T
     linear_operator_product :: F
-           search_direction :: F
-                   residual :: F
-             preconditioner :: M
-     preconditioner_product :: P
+    search_direction :: F
+    residual :: F
+    preconditioner :: M
+    preconditioner_product :: P
 end
 
-architecture(solver::PreconditionedConjugateGradientSolver) = solver.architecture
+architecture(solver::ConjugateGradientSolver) = solver.architecture
+iteration(cgs::ConjugateGradientSolver) = cgs.iteration
 
 initialize_precondition_product(preconditioner, template_field) = similar(template_field)
 initialize_precondition_product(::Nothing, template_field) = nothing
 
+Base.summary(::ConjugateGradientSolver) = "ConjugateGradientSolver"
+
 # "Nothing" preconditioner
 @inline precondition!(z, ::Nothing, r, args...) = r
 
 """
-    PreconditionedConjugateGradientSolver(linear_operation;
+    ConjugateGradientSolver(linear_operation;
                                           template_field,
                                           maxiter = size(template_field.grid),
-                                          reltol = sqrt(eps(eltype(template_field.grid))),
+                                          reltol = sqrt(eps(template_field.grid)),
                                           abstol = 0,
                                           preconditioner = nothing)
 
-Returns a `PreconditionedConjugateGradientSolver` that solves the linear equation
+Returns a `ConjugateGradientSolver` that solves the linear equation
 ``A x = b`` using a iterative conjugate gradient method with optional preconditioning.
 
 The solver is used by calling
@@ -70,7 +73,7 @@ Arguments
 
 See [`solve!`](@ref) for more information about the preconditioned conjugate-gradient algorithm.
 """
-function PreconditionedConjugateGradientSolver(linear_operation;
+function ConjugateGradientSolver(linear_operation;
                                                template_field::AbstractField,
                                                maxiter = prod(size(template_field)),
                                                reltol = sqrt(eps(eltype(template_field.grid))),
@@ -90,7 +93,7 @@ function PreconditionedConjugateGradientSolver(linear_operation;
 
     FT = eltype(grid)
 
-    return PreconditionedConjugateGradientSolver(arch,
+    return ConjugateGradientSolver(arch,
                                                  grid,
                                                  linear_operation,
                                                  FT(reltol),
@@ -106,7 +109,7 @@ function PreconditionedConjugateGradientSolver(linear_operation;
 end
 
 """
-    solve!(x, solver::PreconditionedConjugateGradientSolver, b, args...)
+    solve!(x, solver::ConjugateGradientSolver, b, args...)
 
 Solve `A * x = b` using an iterative conjugate-gradient method, where `A * x` is
 determined by `solver.linear_operation`
@@ -154,7 +157,7 @@ Loop:
      ρⁱ⁻¹ = ρ
 ```
 """
-function solve!(x, solver::PreconditionedConjugateGradientSolver, b, args...)
+function solve!(x, solver::ConjugateGradientSolver, b, args...)
 
     # Initialize
     solver.iteration = 0
@@ -167,8 +170,8 @@ function solve!(x, solver::PreconditionedConjugateGradientSolver, b, args...)
     residual_norm = norm(solver.residual)
     tolerance = max(solver.reltol * residual_norm, solver.abstol)
 
-    @debug "PreconditionedConjugateGradientSolver, |b|: $(norm(b))"
-    @debug "PreconditionedConjugateGradientSolver, |A * x|: $(norm(q))"
+    @debug "ConjugateGradientSolver, |b|: $(norm(b))"
+    @debug "ConjugateGradientSolver, |A * x|: $(norm(q))"
 
     while iterating(solver, tolerance)
         iterate!(x, solver, b, args...)
@@ -182,7 +185,7 @@ function iterate!(x, solver, b, args...)
     p = solver.search_direction
     q = solver.linear_operator_product
 
-    @debug "PreconditionedConjugateGradientSolver $(solver.iteration), |r|: $(norm(r))"
+    @debug "ConjugateGradientSolver $(solver.iteration), |r|: $(norm(r))"
 
     # Preconditioned:   z = P * r
     # Unpreconditioned: z = r
@@ -190,8 +193,8 @@ function iterate!(x, solver, b, args...)
 
     ρ = dot(z, r)
 
-    @debug "PreconditionedConjugateGradientSolver $(solver.iteration), ρ: $ρ"
-    @debug "PreconditionedConjugateGradientSolver $(solver.iteration), |z|: $(norm(z))"
+    @debug "ConjugateGradientSolver $(solver.iteration), ρ: $ρ"
+    @debug "ConjugateGradientSolver $(solver.iteration), |z|: $(norm(z))"
 
     @apply_regionally perform_iteration!(q, p, ρ, z, solver, args...)
 
@@ -201,8 +204,8 @@ function iterate!(x, solver, b, args...)
 
     α = ρ / dot(p, q)
 
-    @debug "PreconditionedConjugateGradientSolver $(solver.iteration), |q|: $(norm(q))"
-    @debug "PreconditionedConjugateGradientSolver $(solver.iteration), α: $α"
+    @debug "ConjugateGradientSolver $(solver.iteration), |q|: $(norm(q))"
+    @debug "ConjugateGradientSolver $(solver.iteration), α: $α"
         
     @apply_regionally update_solution_and_residuals!(x, r, q, p, α)
 
@@ -234,7 +237,7 @@ function perform_iteration!(q, p, ρ, z, solver, args...)
         β = ρ / solver.ρⁱ⁻¹
         pp .= zp .+ β .* pp
 
-        @debug "PreconditionedConjugateGradientSolver $(solver.iteration), β: $β"
+        @debug "ConjugateGradientSolver $(solver.iteration), β: $β"
     end
 
     # q = A * p
@@ -262,8 +265,8 @@ function iterating(solver, tolerance)
     return true
 end
 
-function Base.show(io::IO, solver::PreconditionedConjugateGradientSolver)
-    print(io, "PreconditionedConjugateGradientSolver on ", summary(solver.architecture), "\n",
+function Base.show(io::IO, solver::ConjugateGradientSolver)
+    print(io, "ConjugateGradientSolver on ", summary(solver.architecture), "\n",
               "├── template_field: ", summary(solver.residual), "\n",
               "├── grid: ", summary(solver.grid), "\n",
               "├── linear_operation!: ", prettysummary(solver.linear_operation!), "\n",
diff --git a/src/Solvers/discrete_transforms.jl b/src/Solvers/discrete_transforms.jl
index 9cf20a3c88..119ca0577d 100644
--- a/src/Solvers/discrete_transforms.jl
+++ b/src/Solvers/discrete_transforms.jl
@@ -1,4 +1,4 @@
-import Oceananigans.Architectures: architecture
+import Oceananigans.Architectures: architecture, child_architecture
 
 abstract type AbstractTransformDirection end
 
@@ -16,7 +16,8 @@ struct DiscreteTransform{P, D, G, Δ, Ω, N, T, Σ}
      transpose_dims :: Σ
 end
 
-architecture(transform::DiscreteTransform) = architecture(transform.grid)
+# Includes support for distributed architectures
+architecture(transform::DiscreteTransform) = child_architecture(architecture(transform.grid))
 
 #####
 ##### Normalization factors
@@ -80,7 +81,7 @@ end
 NoTransform() = DiscreteTransform([nothing for _ in fieldnames(DiscreteTransform)]...)
 
 function DiscreteTransform(plan, direction, grid, dims)
-    arch = architecture(grid)
+    arch = child_architecture(grid) # In case we are doing it on a DistributedGrid
 
     isnothing(plan) && return NoTransform()
 
@@ -144,7 +145,7 @@ end
 
 function apply_transform!(A, B, plan, transpose_dims)
     old_size = size(A)
-    transposed_size = [old_size[d] for d in transpose_dims]
+    transposed_size = Tuple(old_size[d] for d in transpose_dims)
 
     if old_size == transposed_size
         permutedims!(B, A, transpose_dims)
diff --git a/src/Solvers/fft_based_poisson_solver.jl b/src/Solvers/fft_based_poisson_solver.jl
index 084562471c..3f5fae67e3 100644
--- a/src/Solvers/fft_based_poisson_solver.jl
+++ b/src/Solvers/fft_based_poisson_solver.jl
@@ -21,14 +21,16 @@ function transform_list_str(transform_list)
     return list
 end
 
+Base.summary(solver::FFTBasedPoissonSolver) = "FFTBasedPoissonSolver"
+
 Base.show(io::IO, solver::FFTBasedPoissonSolver) =
-print(io, "FFTBasedPoissonSolver on ", string(typeof(architecture(solver))), ": \n",
-          "├── grid: $(summary(solver.grid))\n",
-          "├── storage: $(typeof(solver.storage))\n",
-          "├── buffer: $(typeof(solver.buffer))\n",
-          "└── transforms:\n",
-          "    ├── forward: ", transform_list_str(solver.transforms.forward), "\n",
-          "    └── backward: ", transform_list_str(solver.transforms.backward))
+    print(io, "FFTBasedPoissonSolver on ", string(typeof(architecture(solver))), ": \n",
+              "├── grid: $(summary(solver.grid))\n",
+              "├── storage: $(typeof(solver.storage))\n",
+              "├── buffer: $(typeof(solver.buffer))\n",
+              "└── transforms:\n",
+              "    ├── forward: ", transform_list_str(solver.transforms.forward), "\n",
+              "    └── backward: ", transform_list_str(solver.transforms.backward))
 
 """
     FFTBasedPoissonSolver(grid, planner_flag=FFTW.PATIENT)
@@ -90,7 +92,7 @@ elements (typically the same type as `solver.storage`).
     Equation ``(∇² + m) ϕ = b`` is sometimes referred to as the "screened Poisson" equation
     when ``m < 0``, or the Helmholtz equation when ``m > 0``.
 """
-function solve!(ϕ, solver::FFTBasedPoissonSolver, b, m=0)
+function solve!(ϕ, solver::FFTBasedPoissonSolver, b=solver.storage, m=0)
     arch = architecture(solver)
     topo = TX, TY, TZ = topology(solver.grid)
     Nx, Ny, Nz = size(solver.grid)
@@ -100,7 +102,9 @@ function solve!(ϕ, solver::FFTBasedPoissonSolver, b, m=0)
     ϕc = solver.storage
 
     # Transform b *in-place* to eigenfunction space
-    [transform!(b, solver.buffer) for transform! in solver.transforms.forward]
+    for transform! in solver.transforms.forward
+        transform!(b, solver.buffer)
+    end
 
     # Solve the discrete screened Poisson equation (∇² + m) ϕ = b.
     @. ϕc = - b / (λx + λy + λz - m)
@@ -111,7 +115,9 @@ function solve!(ϕ, solver::FFTBasedPoissonSolver, b, m=0)
     m === 0 && CUDA.@allowscalar ϕc[1, 1, 1] = 0
 
     # Apply backward transforms in order
-    [transform!(ϕc, solver.buffer) for transform! in solver.transforms.backward]
+    for transform! in solver.transforms.backward
+        transform!(ϕc, solver.buffer)
+    end
 
     launch!(arch, solver.grid, :xyz, copy_real_component!, ϕ, ϕc, indices(ϕ))
     
@@ -129,3 +135,4 @@ end
 
     @inbounds ϕ[i′, j′, k′] = real(ϕc[i, j, k])
 end
+
diff --git a/src/Solvers/fourier_tridiagonal_poisson_solver.jl b/src/Solvers/fourier_tridiagonal_poisson_solver.jl
index 1971ab8d52..665489e120 100644
--- a/src/Solvers/fourier_tridiagonal_poisson_solver.jl
+++ b/src/Solvers/fourier_tridiagonal_poisson_solver.jl
@@ -50,7 +50,6 @@ end
     @inbounds D[i, j, Nz] = -1 / Δzᵃᵃᶠ(i, j, Nz, grid) - Δzᵃᵃᶜ(i, j, Nz, grid) * (λx[i] + λy[j])
 end
 
-
 stretched_direction(::YZRegularRG) = XDirection()
 stretched_direction(::XZRegularRG) = YDirection()
 stretched_direction(::XYRegularRG) = ZDirection()
@@ -120,13 +119,17 @@ function solve!(x, solver::FourierTridiagonalPoissonSolver, b=nothing)
     ϕ = solver.storage
 
     # Apply forward transforms in order
-    [transform!(solver.source_term, solver.buffer) for transform! in solver.transforms.forward]
+    for transform! in solver.transforms.forward
+        transform!(solver.source_term, solver.buffer)
+    end
 
     # Solve tridiagonal system of linear equations at every column.
     solve!(ϕ, solver.batched_tridiagonal_solver, solver.source_term)
 
     # Apply backward transforms in order
-    [transform!(ϕ, solver.buffer) for transform! in solver.transforms.backward]
+    for transform! in solver.transforms.backward
+        transform!(ϕ, solver.buffer)
+    end
 
     # Set the volume mean of the solution to be zero.
     # Solutions to Poisson's equation are only unique up to a constant (the global mean
@@ -149,9 +152,7 @@ function set_source_term!(solver::FourierTridiagonalPoissonSolver, source_term)
     grid = solver.grid
     arch = architecture(solver)
     solver.source_term .= source_term
-
     launch!(arch, grid, :xyz, multiply_by_stretched_spacing!, solver.source_term, grid)
-
     return nothing
 end
 
@@ -170,3 +171,4 @@ end
     i, j, k = @index(Global, NTuple)
     @inbounds a[i, j, k] *= Δzᵃᵃᶜ(i, j, k, grid)
 end
+
diff --git a/src/Solvers/heptadiagonal_iterative_solver.jl b/src/Solvers/heptadiagonal_iterative_solver.jl
index d3fb92f86a..c67a500ef9 100644
--- a/src/Solvers/heptadiagonal_iterative_solver.jl
+++ b/src/Solvers/heptadiagonal_iterative_solver.jl
@@ -21,7 +21,7 @@ mutable struct HeptadiagonalIterativeSolver{G, R, L, D, M, P, PM, PS, I, ST, T,
            iterative_solver :: I
                  state_vars :: ST
                   tolerance :: T
-                previous_Δt :: F
+                    last_Δt :: F
          maximum_iterations :: Int
                     verbose :: Bool
 end
@@ -70,7 +70,7 @@ The matrix constructors are calculated based on the pentadiagonal coeffients pas
 to `matrix_from_coefficients` function.
 
 To allow for variable time step, the diagonal term `- Az / (g * Δt²)` is only added later on
-and it is updated only when the previous time step changes (`previous_Δt != Δt`).
+and it is updated only when the previous time step changes (`last_Δt != Δt`).
 
 Preconditioning is done through the various methods implemented in `Solvers/sparse_preconditioners.jl`.
     
@@ -296,7 +296,7 @@ function solve!(x, solver::HeptadiagonalIterativeSolver, b, Δt)
     arch = architecture(solver.matrix)
     
     # update matrix and preconditioner if time step changes
-    if Δt != solver.previous_Δt
+    if Δt != solver.last_Δt
         constructors = deepcopy(solver.matrix_constructors)
         M = prod(solver.problem_size)
         update_diag!(constructors, arch, M, M, solver.diagonal, Δt, 0)
@@ -308,7 +308,7 @@ function solve!(x, solver::HeptadiagonalIterativeSolver, b, Δt)
                                                          solver.matrix,
                                                          solver.preconditioner_settings)
 
-        solver.previous_Δt = Δt
+        solver.last_Δt = Δt
     end
     
     solver.iterative_solver(x, solver.matrix, b, 
diff --git a/src/Solvers/plan_transforms.jl b/src/Solvers/plan_transforms.jl
index 7fd3b8a703..78608cfc2f 100644
--- a/src/Solvers/plan_transforms.jl
+++ b/src/Solvers/plan_transforms.jl
@@ -145,7 +145,6 @@ function plan_transforms(grid::XYZRegularRG, storage, planner_flag)
     return transforms
 end
 
-
 """ Used by FourierTridiagonalPoissonSolver. """
 function plan_transforms(grid::Union{XYRegularRG, XZRegularRG, YZRegularRG}, storage, planner_flag)
     Nx, Ny, Nz = size(grid)
diff --git a/src/TimeSteppers/TimeSteppers.jl b/src/TimeSteppers/TimeSteppers.jl
index a935da5c20..2955ca6999 100644
--- a/src/TimeSteppers/TimeSteppers.jl
+++ b/src/TimeSteppers/TimeSteppers.jl
@@ -35,7 +35,8 @@ julia> stepper = TimeStepper(:QuasiAdamsBashforth2, CPU(), grid, tracernames)
 """
 function TimeStepper(name::Symbol, args...; kwargs...)
     fullname = Symbol(name, :TimeStepper)
-    return @eval $fullname($args...; $kwargs...)
+    TS = getglobal(@__MODULE__, fullname) # look up the binding named `fullname` in this module
+    return TS(args...; kwargs...)
 end
 
 # Fallback
diff --git a/src/TimeSteppers/clock.jl b/src/TimeSteppers/clock.jl
index b1b8da173b..48d8e5172f 100644
--- a/src/TimeSteppers/clock.jl
+++ b/src/TimeSteppers/clock.jl
@@ -13,32 +13,61 @@ The `stage` is updated only for multi-stage time-stepping methods. The `time::T`
 either a number or a `DateTime` object.
 """
 mutable struct Clock{TT, DT}
-         time :: TT
-      last_Δt :: DT
+    time :: TT
+    last_Δt :: DT
+    last_stage_Δt :: DT
     iteration :: Int
-        stage :: Int
+    stage :: Int
 end
 
 """
-    Clock(; time, last_Δt = Inf, iteration=0, stage=1)
+    Clock(; time, last_Δt=Inf, last_stage_Δt=Inf, iteration=0, stage=1)
 
 Returns a `Clock` object. By default, `Clock` is initialized to the zeroth `iteration`
-and first time step `stage` with `last_Δt`.
+and first time step `stage` with `last_Δt=last_stage_Δt=Inf`.
 """
-Clock(; time::TT, last_Δt::DT=Inf, iteration=0, stage=1) where {TT, DT} = Clock{TT, DT}(time, last_Δt, iteration, stage)
+function Clock(; time,
+               last_Δt = Inf,
+               last_stage_Δt = last_Δt, # mirrors last_Δt unless given explicitly
+               iteration = 0,
+               stage = 1)
+    # TT and DT are inferred from `time` and `last_Δt`; `last_stage_Δt` is converted to DT.
+    TT = typeof(time)
+    DT = typeof(last_Δt)
+    last_stage_Δt = convert(DT, last_stage_Δt) # was convert(DT, last_Δt), which discarded the keyword
+    return Clock{TT, DT}(time, last_Δt, last_stage_Δt, iteration, stage)
+end
+
 # TODO: when supporting DateTime, this function will have to be extended
 time_step_type(TT) = TT
 
-function Clock{TT}(; time, last_Δt=Inf, iteration=0, stage=1) where TT
+function Clock{TT}(; time,
+                   last_Δt = Inf,
+                   last_stage_Δt = Inf,
+                   iteration = 0,
+                   stage = 1) where TT
+    # DT is obtained from TT via time_step_type; both time-step keywords are converted to DT.
     DT = time_step_type(TT)
     last_Δt = convert(DT, last_Δt)
-    return Clock{TT, DT}(time, last_Δt, iteration, stage)
+    last_stage_Δt = convert(DT, last_stage_Δt)
+
+    return Clock{TT, DT}(time, last_Δt, last_stage_Δt, iteration, stage)
 end
 
-Base.summary(clock::Clock) = string("Clock(time=$(prettytime(clock.time)), iteration=$(clock.iteration), last_Δt=$(prettytime(clock.last_Δt)))")
+function Base.summary(clock::Clock)
+    # One-line description: the field types, then time, iteration and last time step.
+    time_type = typeof(clock.time)
+    step_type = typeof(clock.last_Δt)
+    current_time = prettytime(clock.time)
+    previous_step = prettytime(clock.last_Δt)
+    return "Clock{$time_type, $step_type}(time=$current_time, iteration=$(clock.iteration), last_Δt=$previous_step)"
+end
 
-Base.show(io::IO, c::Clock{TT, DT}) where {TT, DT} =
-    println(io, "Clock{$TT, $DT}: time = $(prettytime(c.time)), last_Δt = $(prettytime(c.last_Δt)), iteration = $(c.iteration), stage = $(c.stage)")
+function Base.show(io::IO, clock::Clock) # summary line followed by a two-entry tree
+    stage_line = string("├── stage: ", clock.stage)
+    stage_Δt_line = string("└── last_stage_Δt: ", prettytime(clock.last_stage_Δt))
+    return print(io, summary(clock), '\n', stage_line, '\n', stage_Δt_line)
+end
 
 next_time(clock, Δt) = clock.time + Δt
 next_time(clock::Clock{<:AbstractTime}, Δt) = clock.time + Nanosecond(round(Int, 1e9 * Δt))
@@ -61,8 +90,6 @@ function tick!(clock, Δt; stage=false)
 
     tick_time!(clock, Δt)
 
-    clock.last_Δt = Δt
-
     if stage # tick a stage update
         clock.stage += 1
     else # tick an iteration and reset stage
@@ -73,7 +100,11 @@ function tick!(clock, Δt; stage=false)
     return nothing
 end
 
-"Adapt `Clock` to work on the GPU via CUDAnative and CUDAdrv."
-Adapt.adapt_structure(to, clock::Clock) =
-    (time=clock.time, last_Δt=clock.last_Δt, iteration=clock.iteration, stage=clock.stage)
+"""Adapt `Clock` for GPU: returns a `NamedTuple` holding the clock's five fields."""
+Adapt.adapt_structure(to, clock::Clock) = (time          = clock.time,
+                                           last_Δt       = clock.last_Δt,
+                                           last_stage_Δt = clock.last_stage_Δt,
+                                           iteration     = clock.iteration,
+                                           stage         = clock.stage)
+    
 
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index d4a0dbdac4..ec047f4a61 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -3,7 +3,6 @@ using Oceananigans.Utils: @apply_regionally, apply_regionally!
 
 mutable struct QuasiAdamsBashforth2TimeStepper{FT, GT, IT} <: AbstractTimeStepper
                   χ :: FT
-        previous_Δt :: FT
                  Gⁿ :: GT
                  G⁻ :: GT
     implicit_solver :: IT
@@ -48,33 +47,40 @@ function QuasiAdamsBashforth2TimeStepper(grid, tracers,
     GT = typeof(Gⁿ)
     χ  = convert(FT, χ)
 
-    return QuasiAdamsBashforth2TimeStepper{FT, GT, IT}(χ, Inf, Gⁿ, G⁻, implicit_solver)
+    return QuasiAdamsBashforth2TimeStepper{FT, GT, IT}(χ, Gⁿ, G⁻, implicit_solver)
 end
 
-function reset!(timestepper::QuasiAdamsBashforth2TimeStepper)
-    timestepper.previous_Δt = Inf
-    return nothing
-end
+reset!(timestepper::QuasiAdamsBashforth2TimeStepper) = nothing
 
 #####
 ##### Time steppping
 #####
 
 """
-    time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt; euler=false, compute_tendencies=true)
+    time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt; euler=false)
 
 Step forward `model` one time step `Δt` with a 2nd-order Adams-Bashforth method and
 pressure-correction substep. Setting `euler=true` will take a forward Euler time step.
-Setting `compute_tendencies=false` will not calculate new tendencies
+The tendencies are calculated by `update_state!` at the end of the `time_step!` function.
+
+The steps of the Quasi-Adams-Bashforth second-order (AB2) algorithm are:
+
+1. If this is the first time step (`model.clock.iteration == 0`), then call `update_state!` and calculate the tendencies.
+2. Advance tracers in time and compute predictor velocities (including implicit vertical diffusion).
+3. Solve the elliptic equation for pressure (three dimensional for the non-hydrostatic model, two-dimensional for the hydrostatic model).
+4. Correct the velocities based on the results of step 3.
+5. Store the old tendencies.
+6. Update the model state.
+7. Compute tendencies for the next time step
 """
 function time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt;
-                    callbacks=[], euler=false, compute_tendencies=true)
+                    callbacks=[], euler=false)
+
+    Δt == 0 && @warn "Δt == 0 may cause model blowup!"
 
     # Be paranoid and update state at iteration 0
     model.clock.iteration == 0 && update_state!(model, callbacks)
 
-    Δt == 0 && @warn "Δt == 0 may cause model blowup!"
-
     ab2_timestepper = model.timestepper
 
     # Change the default χ if necessary, which occurs if:
@@ -83,16 +89,15 @@ function time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt
     #     need to take an euler step. Note that model.clock.last_Δt is
     #     initialized as Inf
     #   * The user has passed euler=true to time_step!
-    χ₀ = ab2_timestepper.χ
-    euler = euler || (Δt != ab2_timestepper.previous_Δt)
+    euler = euler || (Δt != model.clock.last_Δt)
     
     # If euler, then set χ = -0.5
     minus_point_five = convert(eltype(model.grid), -0.5)
     χ = ifelse(euler, minus_point_five, ab2_timestepper.χ)
 
     # Set time-stepper χ (this is used in ab2_step!, but may also be used elsewhere)
+    χ₀ = ab2_timestepper.χ # Save initial value
     ab2_timestepper.χ = χ
-    ab2_timestepper.previous_Δt = Δt
 
     # Ensure zeroing out all previous tendency fields to avoid errors in
     # case G⁻ includes NaNs. See https://github.com/CliMA/Oceananigans.jl/issues/2259
@@ -103,14 +108,19 @@ function time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt
         end
     end
 
-    model.clock.iteration == 0 && update_state!(model, callbacks)
-
+    # Be paranoid and update state at iteration 0
+    model.clock.iteration == 0 && update_state!(model, callbacks; compute_tendencies=true)
+    
     ab2_step!(model, Δt) # full step for tracers, fractional step for velocities.
+    
+    tick!(model.clock, Δt)
+    model.clock.last_Δt = Δt
+    model.clock.last_stage_Δt = Δt # just one stage
+    
     calculate_pressure_correction!(model, Δt)
     @apply_regionally correct_velocities_and_store_tendencies!(model, Δt)
 
-    tick!(model.clock, Δt)
-    update_state!(model, callbacks; compute_tendencies)
+    update_state!(model, callbacks; compute_tendencies=true)
     step_lagrangian_particles!(model, Δt)
 
     # Return χ to initial value
@@ -176,3 +186,4 @@ Time step velocity fields via the 2nd-order quasi Adams-Bashforth method
 end
 
 @kernel ab2_step_field!(::FunctionField, Δt, χ, Gⁿ, G⁻) = nothing
+
diff --git a/src/TimeSteppers/runge_kutta_3.jl b/src/TimeSteppers/runge_kutta_3.jl
index 9c706c88ff..0bc5c34608 100644
--- a/src/TimeSteppers/runge_kutta_3.jl
+++ b/src/TimeSteppers/runge_kutta_3.jl
@@ -78,11 +78,11 @@ The 3rd-order Runge-Kutta method takes three intermediate substep stages to
 achieve a single timestep. A pressure correction step is applied at each intermediate
 stage.
 """
-function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbacks=[], compute_tendencies = true)
+function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbacks=[])
     Δt == 0 && @warn "Δt == 0 may cause model blowup!"
 
     # Be paranoid and update state at iteration 0, in case run! is not used:
-    model.clock.iteration == 0 && update_state!(model, callbacks)
+    model.clock.iteration == 0 && update_state!(model, callbacks; compute_tendencies = true)
 
     γ¹ = model.timestepper.γ¹
     γ² = model.timestepper.γ²
@@ -95,18 +95,23 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
     second_stage_Δt = (γ² + ζ²) * Δt
     third_stage_Δt  = (γ³ + ζ³) * Δt
 
+    # Compute the next time step a priori to reduce floating point error accumulation
+    tⁿ⁺¹ = next_time(model.clock, Δt)
+
     #
     # First stage
     #
 
     rk3_substep!(model, Δt, γ¹, nothing)
 
+    tick!(model.clock, first_stage_Δt; stage=true)
+    model.clock.last_stage_Δt = first_stage_Δt
+
     calculate_pressure_correction!(model, first_stage_Δt)
     pressure_correct_velocities!(model, first_stage_Δt)
 
-    tick!(model.clock, first_stage_Δt; stage=true)
     store_tendencies!(model)
-    update_state!(model, callbacks)
+    update_state!(model, callbacks; compute_tendencies = true)
     step_lagrangian_particles!(model, first_stage_Δt)
 
     #
@@ -115,12 +120,14 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
 
     rk3_substep!(model, Δt, γ², ζ²)
 
+    tick!(model.clock, second_stage_Δt; stage=true)
+    model.clock.last_stage_Δt = second_stage_Δt
+
     calculate_pressure_correction!(model, second_stage_Δt)
     pressure_correct_velocities!(model, second_stage_Δt)
 
-    tick!(model.clock, second_stage_Δt; stage=true)
     store_tendencies!(model)
-    update_state!(model, callbacks)
+    update_state!(model, callbacks; compute_tendencies = true)
     step_lagrangian_particles!(model, second_stage_Δt)
 
     #
@@ -129,11 +136,19 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
     
     rk3_substep!(model, Δt, γ³, ζ³)
 
-    calculate_pressure_correction!(model, third_stage_Δt)
-    pressure_correct_velocities!(model, third_stage_Δt)
+    # Advance the clock by the corrected final stage so that it lands on tⁿ⁺¹, which reduces
+    # the accumulation of round-off error in model.clock.time. Note that we still use
+    # third_stage_Δt for the substep, pressure correction, and Lagrangian particles step.
+    corrected_third_stage_Δt = tⁿ⁺¹ - model.clock.time
 
-    tick!(model.clock, third_stage_Δt)
-    update_state!(model, callbacks; compute_tendencies)
+    tick!(model.clock, corrected_third_stage_Δt)
+    model.clock.last_stage_Δt = corrected_third_stage_Δt
+    model.clock.last_Δt = Δt
+
+    calculate_pressure_correction!(model, third_stage_Δt)
+    pressure_correct_velocities!(model, third_stage_Δt)
+  
+    update_state!(model, callbacks; compute_tendencies = true)
     step_lagrangian_particles!(model, third_stage_Δt)
 
     return nothing
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index f401a8eab3..f810ce5e2f 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -165,7 +165,7 @@ include("turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl")
 include("turbulence_closure_implementations/smagorinsky_lilly.jl")
 include("turbulence_closure_implementations/anisotropic_minimum_dissipation.jl")
 include("turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl")
-include("turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl")
+include("turbulence_closure_implementations/TKEBasedVerticalDiffusivities/TKEBasedVerticalDiffusivities.jl")
 include("turbulence_closure_implementations/ri_based_vertical_diffusivity.jl")
 
 # Special non-abstracted diffusivities:
@@ -173,7 +173,7 @@ include("turbulence_closure_implementations/ri_based_vertical_diffusivity.jl")
 include("turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl")
 include("turbulence_closure_implementations/leith_enstrophy_diffusivity.jl")
 
-using .CATKEVerticalDiffusivities: CATKEVerticalDiffusivity
+using .TKEBasedVerticalDiffusivities: CATKEVerticalDiffusivity, TKEDissipationVerticalDiffusivity
 
 # Miscellaneous utilities
 include("diffusivity_fields.jl")
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
deleted file mode 100644
index acb30cd70a..0000000000
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ /dev/null
@@ -1,282 +0,0 @@
-"""
-    struct TurbulentKineticEnergyEquation{FT}
-
-Parameters for the evolution of oceanic turbulent kinetic energy at the O(1 m) scales associated with
-isotropic turbulence and diapycnal mixing.
-"""
-Base.@kwdef struct TurbulentKineticEnergyEquation{FT}
-    CʰⁱD  :: FT = 0.579 # Dissipation length scale shear coefficient for high Ri
-    CˡᵒD  :: FT = 1.604 # Dissipation length scale shear coefficient for low Ri
-    CᵘⁿD  :: FT = 0.923 # Dissipation length scale shear coefficient for high Ri
-    CᶜD   :: FT = 3.254 # Dissipation length scale convecting layer coefficient
-    CᵉD   :: FT = 0.0   # Dissipation length scale penetration layer coefficient
-    Cᵂu★  :: FT = 3.179 # Surface shear-driven TKE flux coefficient
-    CᵂwΔ  :: FT = 0.383 # Surface convective TKE flux coefficient
-    Cᵂϵ   :: FT = 1.0   # Dissipative near-bottom TKE flux coefficient
-end
-
-#####
-##### Terms in the turbulent kinetic energy equation, all at cell centers
-#####
-
-# Note special attention paid to averaging the vertical grid spacing correctly
-@inline Δz_νₑ_az_bzᶠᶜᶠ(i, j, k, grid, νₑ, a, b) = ℑxᶠᵃᵃ(i, j, k, grid, νₑ) * ∂zᶠᶜᶠ(i, j, k, grid, a) * 
-                                                  Δzᶠᶜᶠ(i, j, k, grid)     * ∂zᶠᶜᶠ(i, j, k, grid, b)
-
-@inline Δz_νₑ_az_bzᶜᶠᶠ(i, j, k, grid, νₑ, a, b) = ℑyᵃᶠᵃ(i, j, k, grid, νₑ) * ∂zᶜᶠᶠ(i, j, k, grid, a) * 
-                                                  Δzᶜᶠᶠ(i, j, k, grid)     * ∂zᶜᶠᶠ(i, j, k, grid, b)
-
-@inline function shear_production_xᶠᶜᶜ(i, j, k, grid, νₑ, uⁿ, u⁺)
-    Δz_Pxⁿ = ℑbzᵃᵃᶜ(i, j, k, grid, Δz_νₑ_az_bzᶠᶜᶠ, νₑ, uⁿ, u⁺)
-    Δz_Px⁺ = ℑbzᵃᵃᶜ(i, j, k, grid, Δz_νₑ_az_bzᶠᶜᶠ, νₑ, u⁺, u⁺)
-    return (Δz_Pxⁿ + Δz_Px⁺) / (2 * Δzᶠᶜᶜ(i, j, k, grid))
-end
-
-@inline function shear_production_yᶜᶠᶜ(i, j, k, grid, νₑ, vⁿ, v⁺)
-    Δz_Pyⁿ = ℑbzᵃᵃᶜ(i, j, k, grid, Δz_νₑ_az_bzᶜᶠᶠ, νₑ, vⁿ, v⁺)
-    Δz_Py⁺ = ℑbzᵃᵃᶜ(i, j, k, grid, Δz_νₑ_az_bzᶜᶠᶠ, νₑ, v⁺, v⁺)
-    return (Δz_Pyⁿ + Δz_Py⁺) / (2 * Δzᶜᶠᶜ(i, j, k, grid))
-end
-
-@inline function shear_production(i, j, k, grid, νₑ, uⁿ, u⁺, vⁿ, v⁺)
-    # Reconstruct the shear production term in an "approximately conservative" manner
-    # (ie respecting the spatial discretization and using a stencil commensurate with the
-    # loss of mean kinetic energy due to shear production --- but _not_ respecting the 
-    # the temporal discretization. Note that also respecting the temporal discretization, would
-    # require storing the velocity field at n and n+1):
-
-    return ℑxᶜᵃᵃ(i, j, k, grid, shear_production_xᶠᶜᶜ, νₑ, uⁿ, u⁺) +
-           ℑyᵃᶜᵃ(i, j, k, grid, shear_production_yᶜᶠᶜ, νₑ, vⁿ, v⁺)
-end
-
-# To reconstruct buoyancy flux "conservatively" (ie approximately correpsonding to production/destruction
-# of mean potential energy):
-@inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
-    κc = @inbounds diffusivities.κc[i, j, k]
-    N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
-    return - κc * N²
-end
- 
-@inline explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
-    ℑbzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
-
-@inline buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities) =
-    explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
-
-const VITD = VerticallyImplicitTimeDiscretization
-
-@inline function buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, velocities, tracers, buoyancy, diffusivities)
-    wb = explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
-
-    # "Patankar trick" for buoyancy production (cf Patankar 1980 or Burchard et al. 2003)
-    # If buoyancy flux is a _sink_ of TKE, we treat it implicitly, and return zero here for
-    # the explicit buoyancy flux.
-    return max(zero(grid), wb)
-end
-
-@inline dissipation(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, args...) = zero(grid)
-
-@inline function dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers,
-                                             buoyancy, surface_buoyancy_flux)
-
-    # Convective dissipation length
-    Cᶜ = closure.turbulent_kinetic_energy_equation.CᶜD
-    Cᵉ = closure.turbulent_kinetic_energy_equation.CᵉD
-    Cˢᵖ = closure.mixing_length.Cˢᵖ
-    Jᵇ = surface_buoyancy_flux
-    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᵖ, velocities, tracers, buoyancy, Jᵇ)
-
-    # "Stable" dissipation length
-    Cˡᵒ = closure.turbulent_kinetic_energy_equation.CˡᵒD
-    Cʰⁱ = closure.turbulent_kinetic_energy_equation.CʰⁱD
-    Cᵘⁿ = closure.turbulent_kinetic_energy_equation.CᵘⁿD
-    σᴰ = stability_functionᶜᶜᶜ(i, j, k, grid, closure, Cᵘⁿ, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
-    ℓ★ = stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
-    ℓ★ = ℓ★ / σᴰ
-
-    # Dissipation length
-    ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
-    ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
-    ℓᴰ = max(ℓ★, ℓʰ)
-
-    H = total_depthᶜᶜᵃ(i, j, grid)
-    return min(H, ℓᴰ)
-end
-
-@inline function dissipation_rate(i, j, k, grid, closure::FlavorOfCATKE,
-                                  velocities, tracers, buoyancy, diffusivities)
-
-    ℓᴰ = dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Jᵇ)
-    e = tracers.e
-    FT = eltype(grid)
-    eᵢ = @inbounds e[i, j, k]
-    
-    # Note:
-    #   Because   ∂t e + ⋯ = ⋯ + L e = ⋯ - ϵ,
-    #
-    #   then      L e = - ϵ
-    #                 = - Cᴰ e³² / ℓ
-    #
-    #   and thus    L = - Cᴰ √e / ℓ .
-
-    ω_numerical = 1 / closure.negative_turbulent_kinetic_energy_damping_time_scale
-    ω_physical = sqrt(abs(eᵢ)) / ℓᴰ
-
-    return ifelse(eᵢ < 0, ω_numerical, ω_physical)
-end
-
-# Fallbacks for explicit time discretization
-@inline function dissipation(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, args...)
-    eᵢ = @inbounds tracers.e[i, j, k]
-    ω = dissipation_rate(i, j, k, grid, closure, velocities, tracers, args...)
-    return ω * eᵢ
-end
-
-#####
-##### TKE top boundary condition
-#####
-
-""" Compute the flux of TKE through the surface / top boundary. """
-@inline function top_tke_flux(i, j, grid, clock, fields, parameters, closure::FlavorOfCATKE, buoyancy)
-    closure = getclosure(i, j, closure)
-
-    top_tracer_bcs = parameters.top_tracer_boundary_conditions
-    top_velocity_bcs = parameters.top_velocity_boundary_conditions
-    tke_parameters = closure.turbulent_kinetic_energy_equation
-
-    return _top_tke_flux(i, j, grid, clock, fields, tke_parameters, closure,
-                         buoyancy, top_tracer_bcs, top_velocity_bcs)
-end
-
-""" Compute the flux of TKE through the surface / top boundary. """
-@inline top_tke_flux(i, j, grid, clock, fields, parameters, closure, buoyancy) = zero(grid)
-
-@inline top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple::Tuple{<:Any}, buoyancy) =
-    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[1], buoyancy)
-
-@inline top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple::Tuple{<:Any, <:Any}, buoyancy) =
-    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[1], buoyancy) + 
-    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[2], buoyancy)
-
-@inline top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple::Tuple{<:Any, <:Any, <:Any}, buoyancy) =
-    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[1], buoyancy) + 
-    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[2], buoyancy) + 
-    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[3], buoyancy)
-
-@inline function _top_tke_flux(i, j, grid, clock, fields,
-                               tke::TurbulentKineticEnergyEquation, closure::CATKEVD,
-                               buoyancy, top_tracer_bcs, top_velocity_bcs)
-
-    wΔ³ = top_convective_turbulent_velocity_cubed(i, j, grid, clock, fields, buoyancy, top_tracer_bcs)
-    u★ = friction_velocity(i, j, grid, clock, fields, top_velocity_bcs)
-
-    Cᵂu★ = tke.Cᵂu★
-    CᵂwΔ = tke.CᵂwΔ
-
-    return - Cᵂu★ * u★^3 - CᵂwΔ * wΔ³
-end
-
-""" Computes the friction velocity u★ based on fluxes of u and v. """
-@inline function friction_velocity(i, j, grid, clock, fields, velocity_bcs)
-    τx = getbc(velocity_bcs.u, i, j, grid, clock, fields) 
-    τy = getbc(velocity_bcs.v, i, j, grid, clock, fields) 
-    return sqrt(sqrt(τx^2 + τy^2))
-end
-
-""" Computes the convective velocity w★. """
-@inline function top_convective_turbulent_velocity_cubed(i, j, grid, clock, fields, buoyancy, tracer_bcs)
-    Jᵇ = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, fields)
-    Δz = Δzᶜᶜᶜ(i, j, grid.Nz, grid)
-    return clip(Jᵇ) * Δz   
-end
-
-struct TKETopBoundaryConditionParameters{C, U}
-    top_tracer_boundary_conditions :: C
-    top_velocity_boundary_conditions :: U
-end
-
-@inline Adapt.adapt_structure(to, p::TKETopBoundaryConditionParameters) =
-    TKETopBoundaryConditionParameters(adapt(to, p.top_tracer_boundary_conditions),
-                                      adapt(to, p.top_velocity_boundary_conditions))
-
-@inline on_architecture(to, p::TKETopBoundaryConditionParameters) =
-    TKETopBoundaryConditionParameters(on_architecture(to, p.top_tracer_boundary_conditions),
-                                      on_architecture(to, p.top_velocity_boundary_conditions))
-
-
-using Oceananigans.BoundaryConditions: Flux
-const TKEBoundaryFunction = DiscreteBoundaryFunction{<:TKETopBoundaryConditionParameters}
-const TKEBoundaryCondition = BoundaryCondition{<:Flux, <:TKEBoundaryFunction}
-
-@inline getbc(bc::TKEBoundaryCondition, i::Integer, j::Integer, grid::AbstractGrid, clock, fields, clo, buoyancy) =
-    bc.condition.func(i, j, grid, clock, fields, bc.condition.parameters, clo, buoyancy)
-
-@inline getbc(bc::TKEBoundaryCondition, i::Integer, j::Integer, k::Integer, grid::AbstractGrid, clock, fields, clo, buoyancy) =
-    bc.condition.func(i, j, k, grid, clock, fields, bc.condition.parameters, clo, buoyancy)
-
-#####
-##### Utilities for model constructors
-#####
-
-""" Infer tracer boundary conditions from user_bcs and tracer_names. """
-function top_tracer_boundary_conditions(grid, tracer_names, user_bcs)
-    default_tracer_bcs = NamedTuple(c => FieldBoundaryConditions(grid, (Center, Center, Center)) for c in tracer_names)
-    bcs = merge(default_tracer_bcs, user_bcs)
-    return NamedTuple(c => bcs[c].top for c in tracer_names)
-end
-
-""" Infer velocity boundary conditions from `user_bcs` and `tracer_names`. """
-function top_velocity_boundary_conditions(grid, user_bcs)
-    default_top_bc = default_prognostic_bc(topology(grid, 3)(), Center(), DefaultBoundaryCondition())
-
-    user_bc_names = keys(user_bcs)
-    u_top_bc = :u ∈ user_bc_names ? user_bcs.u.top : default_top_bc
-    v_top_bc = :v ∈ user_bc_names ? user_bcs.v.top : default_top_bc
-
-    return (u=u_top_bc, v=v_top_bc)
-end
-
-""" Add TKE boundary conditions specific to `CATKEVerticalDiffusivity`. """
-function add_closure_specific_boundary_conditions(closure::FlavorOfCATKE,
-                                                  user_bcs,
-                                                  grid,
-                                                  tracer_names,
-                                                  buoyancy)
-
-    top_tracer_bcs = top_tracer_boundary_conditions(grid, tracer_names, user_bcs)
-    top_velocity_bcs = top_velocity_boundary_conditions(grid, user_bcs)
-
-    parameters = TKETopBoundaryConditionParameters(top_tracer_bcs, top_velocity_bcs)
-
-    top_tke_bc = FluxBoundaryCondition(top_tke_flux, discrete_form=true, parameters=parameters)
-
-    if :e ∈ keys(user_bcs)
-        e_bcs = user_bcs[:e]
-        
-        tke_bcs = FieldBoundaryConditions(grid, (Center, Center, Center),
-                                          top = top_tke_bc,
-                                          bottom = e_bcs.bottom,
-                                          north = e_bcs.north,
-                                          south = e_bcs.south,
-                                          east = e_bcs.east,
-                                          west = e_bcs.west)
-    else
-        tke_bcs = FieldBoundaryConditions(grid, (Center, Center, Center), top=top_tke_bc)
-    end
-
-    new_boundary_conditions = merge(user_bcs, (; e = tke_bcs))
-
-    return new_boundary_conditions
-end
-
-Base.summary(::TurbulentKineticEnergyEquation) = "CATKEVerticalDiffusivities.TurbulentKineticEnergyEquation"
-Base.show(io::IO, tke::TurbulentKineticEnergyEquation) =
-    print(io, "CATKEVerticalDiffusivities.TurbulentKineticEnergyEquation parameters:", '\n',
-              "├── CʰⁱD: ", tke.CʰⁱD, '\n',
-              "├── CˡᵒD: ", tke.CˡᵒD, '\n',
-              "├── CᵘⁿD: ", tke.CᵘⁿD, '\n',
-              "├── CᶜD:  ", tke.CᶜD,  '\n',
-              "├── CᵉD:  ", tke.CᵉD,  '\n',
-              "├── Cᵂu★: ", tke.Cᵂu★, '\n',
-              "└── CᵂwΔ: ", tke.CᵂwΔ)
-
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/TKEBasedVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/TKEBasedVerticalDiffusivities.jl
new file mode 100644
index 0000000000..77d0d465fb
--- /dev/null
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/TKEBasedVerticalDiffusivities.jl
@@ -0,0 +1,186 @@
+module TKEBasedVerticalDiffusivities
+
+using Adapt
+using CUDA
+using KernelAbstractions: @kernel, @index
+
+using Oceananigans.Architectures
+using Oceananigans.Grids
+using Oceananigans.Utils
+using Oceananigans.Units
+using Oceananigans.Fields
+using Oceananigans.Operators
+
+using Oceananigans.Utils: prettysummary
+using Oceananigans.Grids: peripheral_node, inactive_node, inactive_cell
+using Oceananigans.Fields: ZeroField
+using Oceananigans.BoundaryConditions: default_prognostic_bc, DefaultBoundaryCondition
+using Oceananigans.BoundaryConditions: BoundaryCondition, FieldBoundaryConditions
+using Oceananigans.BoundaryConditions: DiscreteBoundaryFunction, FluxBoundaryCondition
+using Oceananigans.BuoyancyModels: ∂z_b, top_buoyancy_flux
+using Oceananigans.Grids: inactive_cell
+
+using Oceananigans.TurbulenceClosures:
+    getclosure,
+    time_discretization,
+    AbstractScalarDiffusivity,
+    VerticallyImplicitTimeDiscretization,
+    VerticalFormulation
+    
+import Oceananigans.BoundaryConditions: getbc
+import Oceananigans.Utils: with_tracers
+import Oceananigans.TurbulenceClosures:
+    validate_closure,
+    shear_production,
+    buoyancy_flux,
+    dissipation,
+    add_closure_specific_boundary_conditions,
+    compute_diffusivities!,
+    DiffusivityFields,
+    implicit_linear_coefficient,
+    viscosity,
+    diffusivity,
+    viscosity_location,
+    diffusivity_location,
+    diffusive_flux_x,
+    diffusive_flux_y,
+    diffusive_flux_z
+
+const c = Center()
+const f = Face()
+const VITD = VerticallyImplicitTimeDiscretization
+
+@inline ϕ²(i, j, k, grid, ϕ, args...) = ϕ(i, j, k, grid, args...)^2 # square of ϕ evaluated at (i, j, k)
+
+# Returns ℑxᶜᵃᵃ of (∂zᶠᶜᶠ u)² plus ℑyᵃᶜᵃ of (∂zᶜᶠᶠ v)², where ϕ² does the squaring.
+@inline function shearᶜᶜᶠ(i, j, k, grid, u, v)
+    x_part = ℑxᶜᵃᵃ(i, j, k, grid, ϕ², ∂zᶠᶜᶠ, u)
+    y_part = ℑyᵃᶜᵃ(i, j, k, grid, ϕ², ∂zᶜᶠᶠ, v)
+    return x_part + y_part
+end
+
+# Sum of the squared u and v derivatives, each passed through ℑbzᵃᵃᶜ before the horizontal ℑ.
+@inline function shearᶜᶜᶜ(i, j, k, grid, u, v)
+    x_part = ℑxᶜᵃᵃ(i, j, k, grid, ℑbzᵃᵃᶜ, ϕ², ∂zᶠᶜᶠ, u)
+    y_part = ℑyᵃᶜᵃ(i, j, k, grid, ℑbzᵃᵃᶜ, ϕ², ∂zᶜᶠᶠ, v)
+    return x_part + y_part
+end
+
+@inline Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy) =
+    ℑbzᵃᵃᶜ(i, j, k, grid, Riᶜᶜᶠ, velocities, tracers, buoyancy) # ℑbzᵃᵃᶜ reconstruction of Riᶜᶜᶠ
+
+# Ratio N² / S² of ∂z_b to shearᶜᶜᶠ; returns zero wherever N² == 0 (so 0/0 never leaks out).
+@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
+    S² = shearᶜᶜᶠ(i, j, k, grid, velocities.u, velocities.v)
+    N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
+    ratio = N² / S²
+    # The ratio is always evaluated; ifelse only selects which value is returned.
+    return ifelse(N² == 0, zero(grid), ratio)
+end
+
+# @inline ℑbzᵃᵃᶜ(i, j, k, grid, fᵃᵃᶠ, args...) = ℑzᵃᵃᶜ(i, j, k, grid, fᵃᵃᶠ, args...)
+
+# A particular kind of reconstruction that ignores peripheral nodes: averages fᵃᵃᶠ over k and k + 1
+@inline function ℑbzᵃᵃᶜ(i, j, k, grid, fᵃᵃᶠ, args...)
+    k⁺ = k + 1
+    k⁻ = k
+    # Evaluate fᵃᵃᶠ at both vertical indices (both calls are made unconditionally)
+    f⁺ = fᵃᵃᶠ(i, j, k⁺, grid, args...)
+    f⁻ = fᵃᵃᶠ(i, j, k⁻, grid, args...)
+    # Flag which of the two (c, c, f) nodes is peripheral
+    p⁺ = peripheral_node(i, j, k⁺, grid, c, c, f)
+    p⁻ = peripheral_node(i, j, k⁻, grid, c, c, f)
+    # A peripheral value is replaced by the other one; order matters: if both are peripheral the result is the original f⁻
+    f⁺ = ifelse(p⁺, f⁻, f⁺)
+    f⁻ = ifelse(p⁻, f⁺, f⁻)
+
+    return (f⁺ + f⁻) / 2
+end
+
+# Returns -κc * N², used to reconstruct the buoyancy flux "conservatively"
+# (ie approximately corresponding to production/destruction of mean potential energy).
+@inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
+    stratification = ∂z_b(i, j, k, grid, buoyancy, tracers)
+    tracer_diffusivity = @inbounds diffusivities.κc[i, j, k]
+    return - tracer_diffusivity * stratification
+end
+ 
+@inline explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
+    ℑbzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities) # closure and velocities are not used
+
+# Note special attention paid to averaging the vertical grid spacing correctly: each product carries its own Δz factor
+@inline Δz_νₑ_az_bzᶠᶜᶠ(i, j, k, grid, νₑ, a, b) = ℑxᶠᵃᵃ(i, j, k, grid, νₑ) * ∂zᶠᶜᶠ(i, j, k, grid, a) * 
+                                                  Δzᶠᶜᶠ(i, j, k, grid)     * ∂zᶠᶜᶠ(i, j, k, grid, b)
+
+@inline Δz_νₑ_az_bzᶜᶠᶠ(i, j, k, grid, νₑ, a, b) = ℑyᵃᶠᵃ(i, j, k, grid, νₑ) * ∂zᶜᶠᶠ(i, j, k, grid, a) * 
+                                                  Δzᶜᶠᶠ(i, j, k, grid)     * ∂zᶜᶠᶠ(i, j, k, grid, b)
+
+@inline function shear_production_xᶠᶜᶜ(i, j, k, grid, νₑ, uⁿ, u⁺)
+    cross_term = ℑbzᵃᵃᶜ(i, j, k, grid, Δz_νₑ_az_bzᶠᶜᶠ, νₑ, uⁿ, u⁺) # Δz-weighted (uⁿ, u⁺) product
+    plus_term  = ℑbzᵃᵃᶜ(i, j, k, grid, Δz_νₑ_az_bzᶠᶜᶠ, νₑ, u⁺, u⁺) # Δz-weighted (u⁺, u⁺) product
+    return (cross_term + plus_term) / (2 * Δzᶠᶜᶜ(i, j, k, grid)) # mean of the two, Δz divided back out
+end
+
+@inline function shear_production_yᶜᶠᶜ(i, j, k, grid, νₑ, vⁿ, v⁺)
+    cross_term = ℑbzᵃᵃᶜ(i, j, k, grid, Δz_νₑ_az_bzᶜᶠᶠ, νₑ, vⁿ, v⁺) # Δz-weighted (vⁿ, v⁺) product
+    plus_term  = ℑbzᵃᵃᶜ(i, j, k, grid, Δz_νₑ_az_bzᶜᶠᶠ, νₑ, v⁺, v⁺) # Δz-weighted (v⁺, v⁺) product
+    return (cross_term + plus_term) / (2 * Δzᶜᶠᶜ(i, j, k, grid)) # mean of the two, Δz divided back out
+end
+
+@inline function shear_production(i, j, k, grid, νₑ, uⁿ, u⁺, vⁿ, v⁺)
+    # Reconstruct the shear production term in an "approximately conservative" manner
+    # (ie respecting the spatial discretization and using a stencil commensurate with the
+    # loss of mean kinetic energy due to shear production --- but _not_ respecting
+    # the temporal discretization). Note that also respecting the temporal discretization would
+    # require storing the velocity field at n and n+1.
+
+    return ℑxᶜᵃᵃ(i, j, k, grid, shear_production_xᶠᶜᶜ, νₑ, uⁿ, u⁺) +
+           ℑyᵃᶜᵃ(i, j, k, grid, shear_production_yᶜᶠᶜ, νₑ, vⁿ, v⁺)
+end
+
+@inline function turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)
+    tke_floor = closure.minimum_tke # lower bound applied before the square root
+    tke_here = @inbounds e[i, j, k]
+    return sqrt(max(tke_floor, tke_here))
+end
+
+@inline function mask_diffusivity(i, j, k, grid, κ★)
+    is_peripheral = peripheral_node(i, j, k, grid, c, c, f)
+    is_inactive = inactive_node(i, j, k, grid, c, c, f)
+    inner = ifelse(is_inactive, convert(eltype(grid), NaN), κ★) # NaN marks inactive nodes
+    return ifelse(is_peripheral, zero(grid), inner) # the peripheral check takes precedence: zero there
+end
+
+@inline clip(x) = max(zero(x), x) # replaces negative x by zero(x); non-negative x passes through
+
+function get_time_step(closure_array::AbstractArray)
+    # assume they are all the same, so only the [1, 1] entry is read (as a scalar, under CUDA.@allowscalar)
+    closure = CUDA.@allowscalar closure_array[1, 1]
+    return get_time_step(closure)
+end
+
+include("tke_top_boundary_condition.jl")
+
+include("catke_vertical_diffusivity.jl")
+include("catke_mixing_length.jl")
+include("catke_equation.jl")
+include("time_step_catke_equation.jl")
+
+include("tke_dissipation_vertical_diffusivity.jl")
+include("tke_dissipation_stability_functions.jl")
+include("tke_dissipation_equations.jl")
+
+for S in (:CATKEMixingLength,
+          :CATKEEquation,
+          :StratifiedDisplacementScale,
+          :ConstantStabilityFunctions,
+          :VariableStabilityFunctions)
+    # Generic method: rebuild `s` as `S{FT}`, passing each of its properties as a keyword argument
+    @eval @inline convert_eltype(::Type{FT}, s::$S) where FT =
+        $S{FT}(; Dict(p => getproperty(s, p) for p in propertynames(s))...)
+    # More specific method: `s` already has type parameter FT, so it is returned unchanged
+    @eval @inline convert_eltype(::Type{FT}, s::$S{FT}) where FT = s
+end
+
+end # module
+
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/catke_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/catke_equation.jl
new file mode 100644
index 0000000000..a08fb4b969
--- /dev/null
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/catke_equation.jl
@@ -0,0 +1,167 @@
+"""
+    struct CATKEEquation{FT}
+
+Parameters for the evolution of oceanic turbulent kinetic energy at the O(1 m) scales associated with
+isotropic turbulence and diapycnal mixing.
+"""
+Base.@kwdef struct CATKEEquation{FT}
+    CʰⁱD  :: FT = 0.579 # Dissipation length scale shear coefficient for high Ri
+    CˡᵒD  :: FT = 1.604 # Dissipation length scale shear coefficient for low Ri
+    CᵘⁿD  :: FT = 0.923 # Dissipation length scale shear coefficient; NOTE(review): was labelled "high Ri" like CʰⁱD, presumably unstable Ri, confirm
+    CᶜD   :: FT = 3.254 # Dissipation length scale convecting layer coefficient
+    CᵉD   :: FT = 0.0   # Dissipation length scale penetration layer coefficient
+    Cᵂu★  :: FT = 3.179 # Surface shear-driven TKE flux coefficient
+    CᵂwΔ  :: FT = 0.383 # Surface convective TKE flux coefficient
+    Cᵂϵ   :: FT = 1.0   # Dissipative near-bottom TKE flux coefficient
+end
+
+#####
+##### Terms in the turbulent kinetic energy equation, all at cell centers
+#####
+
+#=
+@inline buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities) =
+    explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+
+@inline function buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, velocities, tracers, buoyancy, diffusivities)
+    wb = explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+
+    # "Patankar trick" for buoyancy production (cf Patankar 1980 or Burchard et al. 2003)
+    # If buoyancy flux is a _sink_ of TKE, we treat it implicitly, and return zero here for
+    # the explicit buoyancy flux.
+    return max(zero(grid), wb)
+end
+=#
+
+@inline dissipation(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, args...) = zero(grid) # VITD: no explicit dissipation term; presumably handled implicitly elsewhere, confirm
+
+@inline function dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers,
+                                             buoyancy, surface_buoyancy_flux)
+    # Returns min(H, max(ℓ★ / σᴰ, ℓʰ)), with a NaN in either candidate length replaced by zero
+    # Convective dissipation length
+    Cᶜ = closure.turbulent_kinetic_energy_equation.CᶜD
+    Cᵉ = closure.turbulent_kinetic_energy_equation.CᵉD
+    Cˢᵖ = closure.mixing_length.Cˢᵖ
+    Jᵇ = surface_buoyancy_flux
+    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᵖ, velocities, tracers, buoyancy, Jᵇ)
+
+    # "Stable" dissipation length
+    Cˡᵒ = closure.turbulent_kinetic_energy_equation.CˡᵒD
+    Cʰⁱ = closure.turbulent_kinetic_energy_equation.CʰⁱD
+    Cᵘⁿ = closure.turbulent_kinetic_energy_equation.CᵘⁿD
+    σᴰ = stability_functionᶜᶜᶜ(i, j, k, grid, closure, Cᵘⁿ, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
+    ℓ★ = stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = ℓ★ / σᴰ
+
+    # Dissipation length
+    ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
+    ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
+    ℓᴰ = max(ℓ★, ℓʰ)
+
+    H = total_depthᶜᶜᵃ(i, j, grid)
+    return min(H, ℓᴰ)
+end
+
+@inline function dissipation_rate(i, j, k, grid, closure::FlavorOfCATKE,
+                                  velocities, tracers, buoyancy, diffusivities)
+    # Returns a rate ω; the explicit `dissipation` fallback multiplies it by e to obtain the dissipation.
+    ℓᴰ = dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Jᵇ)
+    eᵢ = @inbounds tracers.e[i, j, k]
+
+    # Note:
+    #   Because   ∂t e + ⋯ = ⋯ + L e = ⋯ - ϵ,
+    #
+    #   then      L e = - ϵ
+    #                 = - Cᴰ e^(3/2) / ℓ
+    #
+    #   and thus    L = - Cᴰ √e / ℓ .
+    # NOTE(review): no Cᴰ factor and no minus sign appear below; presumably absorbed into ℓᴰ and the caller, confirm.
+
+    # For eᵢ < 0 a numerical damping rate replaces the physical one (both are always evaluated).
+    ω_numerical = 1 / closure.negative_tke_damping_time_scale
+    ω_physical = sqrt(abs(eᵢ)) / ℓᴰ
+
+    return ifelse(eᵢ < 0, ω_numerical, ω_physical)
+end
+
+# Fallback for explicit time discretization: returns dissipation_rate times the local e
+@inline function dissipation(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, args...)
+    rate = dissipation_rate(i, j, k, grid, closure, velocities, tracers, args...)
+    tke = @inbounds tracers.e[i, j, k]
+    return rate * tke
+end
+
+#####
+##### TKE top boundary condition
+#####
+
+@inline function top_tke_flux(i, j, grid, clock, fields, parameters, closure::FlavorOfCATKE, buoyancy)
+    # Resolve the closure through getclosure(i, j, closure), then forward its TKE-equation
+    # parameters and the stored top boundary conditions to `_top_tke_flux`.
+    local_closure = getclosure(i, j, closure)
+    equation = local_closure.turbulent_kinetic_energy_equation
+    tracer_bcs = parameters.top_tracer_boundary_conditions
+    velocity_bcs = parameters.top_velocity_boundary_conditions
+
+    return _top_tke_flux(i, j, grid, clock, fields, equation, local_closure, buoyancy, tracer_bcs, velocity_bcs)
+end
+
+@inline function _top_tke_flux(i, j, grid, clock, fields,
+                               tke::CATKEEquation, closure::CATKEVD,
+                               buoyancy, top_tracer_bcs, top_velocity_bcs)
+    # Top TKE flux: a term proportional to u★^3 plus a term proportional to wΔ³,
+    # both entering with a negative sign.
+    wΔ³ = top_convective_turbulent_velocity_cubed(i, j, grid, clock, fields, buoyancy, top_tracer_bcs)
+    u★ = friction_velocity(i, j, grid, clock, fields, top_velocity_bcs)
+
+    shear_part = - tke.Cᵂu★ * u★^3
+    convective_part = tke.CᵂwΔ * wΔ³
+    return shear_part - convective_part
+end
+
+#####
+##### Utilities for model constructors
+#####
+
+""" Add TKE boundary conditions specific to `CATKEVerticalDiffusivity`. """
+function add_closure_specific_boundary_conditions(closure::FlavorOfCATKE,
+                                                  user_bcs,
+                                                  grid,
+                                                  tracer_names,
+                                                  buoyancy)
+
+    # The top boundary condition for `e` is always the discrete-form flux `top_tke_flux`.
+    parameters = TKETopBoundaryConditionParameters(top_tracer_boundary_conditions(grid, tracer_names, user_bcs),
+                                                   top_velocity_boundary_conditions(grid, user_bcs))
+    top_tke_bc = FluxBoundaryCondition(top_tke_flux, discrete_form=true, parameters=parameters)
+    location = (Center, Center, Center)
+
+    tke_bcs = if :e ∈ keys(user_bcs)
+        # Keep the user's bottom/north/south/east/west conditions for `e`, but override the top.
+        e_bcs = user_bcs[:e]
+        FieldBoundaryConditions(grid, location,
+                                top = top_tke_bc,
+                                bottom = e_bcs.bottom,
+                                north = e_bcs.north,
+                                south = e_bcs.south,
+                                east = e_bcs.east,
+                                west = e_bcs.west)
+    else
+        FieldBoundaryConditions(grid, location, top=top_tke_bc)
+    end
+
+    # The new `e` entry replaces any `e` entry already present in user_bcs.
+    return merge(user_bcs, (; e = tke_bcs))
+end
+
+Base.summary(::CATKEEquation) = "TKEBasedVerticalDiffusivities.CATKEEquation"
+Base.show(io::IO, tke::CATKEEquation) = print(io, "TKEBasedVerticalDiffusivities.CATKEEquation parameters:", '\n', # one row per struct field
+              "├── CʰⁱD: ", tke.CʰⁱD, '\n',
+              "├── CˡᵒD: ", tke.CˡᵒD, '\n',
+              "├── CᵘⁿD: ", tke.CᵘⁿD, '\n',
+              "├── CᶜD:  ", tke.CᶜD,  '\n',
+              "├── CᵉD:  ", tke.CᵉD,  '\n',
+              "├── Cᵂu★: ", tke.Cᵂu★, '\n',
+              "├── CᵂwΔ: ", tke.CᵂwΔ, '\n',
+              "└── Cᵂϵ:  ", tke.Cᵂϵ)
+
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/catke_mixing_length.jl
similarity index 93%
rename from src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
rename to src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/catke_mixing_length.jl
index 792166a6ec..8b2620f6b0 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/catke_mixing_length.jl
@@ -8,15 +8,15 @@ using ..TurbulenceClosures:
     total_depthᶜᶜᵃ
 
 """
-    struct MixingLength{FT}
+    struct CATKEMixingLength{FT}
 
 Contains mixing length parameters for CATKE vertical diffusivity.
 """
-Base.@kwdef struct MixingLength{FT}
+Base.@kwdef struct CATKEMixingLength{FT}
     Cˢ   :: FT = 1.131  # Surface distance coefficient for shear length scale
     Cᵇ   :: FT = Inf    # Bottom distance coefficient for shear length scale
     Cˢᵖ  :: FT = 0.505  # Sheared convective plume coefficient
-    CRiᵟ :: FT = 0.102  # Stability function width 
+    CRiᵟ :: FT = 1.02   # Stability function width 
     CRi⁰ :: FT = 0.254  # Stability function lower Ri
     Cʰⁱu :: FT = 0.242  # Shear mixing length coefficient for momentum at high Ri
     Cˡᵒu :: FT = 0.361  # Shear mixing length coefficient for momentum at low Ri
@@ -39,22 +39,6 @@ end
 ##### Mixing length
 #####
 
-@inline ϕ²(i, j, k, grid, ϕ, args...) = ϕ(i, j, k, grid, args...)^2
-
-@inline function shearᶜᶜᶠ(i, j, k, grid, u, v)
-    ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ϕ², ∂zᶠᶜᶠ, u)
-    ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ϕ², ∂zᶜᶠᶠ, v)
-    S² = ∂z_u² + ∂z_v²
-    return S²
-end
-
-@inline function shearᶜᶜᶜ(i, j, k, grid, u, v)
-    ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ℑbzᵃᵃᶜ, ϕ², ∂zᶠᶜᶠ, u)
-    ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ℑbzᵃᵃᶜ, ϕ², ∂zᶜᶠᶠ, v)
-    S² = ∂z_u² + ∂z_v²
-    return S²
-end
-
 @inline function stratification_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
     FT = eltype(grid)
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
@@ -292,10 +276,10 @@ end
     return min(H, ℓe)
 end
 
-Base.summary(::MixingLength) = "CATKEVerticalDiffusivities.MixingLength"
+Base.summary(::CATKEMixingLength) = "TKEBasedVerticalDiffusivities.CATKEMixingLength"
 
-Base.show(io::IO, ml::MixingLength) =
-    print(io, "CATKEVerticalDiffusivities.MixingLength parameters:", '\n',
+Base.show(io::IO, ml::CATKEMixingLength) =
+    print(io, "TKEBasedVerticalDiffusivities.CATKEMixingLength parameters:", '\n',
               " ├── Cˢ:   ", ml.Cˢ,   '\n',
               " ├── Cᵇ:   ", ml.Cᵇ,   '\n',
               " ├── Cʰⁱu: ", ml.Cʰⁱu, '\n',
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/catke_vertical_diffusivity.jl
similarity index 69%
rename from src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
rename to src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/catke_vertical_diffusivity.jl
index 80260c284c..2adf895c82 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/catke_vertical_diffusivity.jl
@@ -1,71 +1,3 @@
-module CATKEVerticalDiffusivities
-
-using Adapt
-using KernelAbstractions: @kernel, @index
-
-using Oceananigans.Architectures
-using Oceananigans.Grids
-using Oceananigans.Utils
-using Oceananigans.Units
-using Oceananigans.Fields
-using Oceananigans.Operators
-
-using Oceananigans.Utils: prettysummary
-using Oceananigans.Grids: peripheral_node, inactive_node, inactive_cell
-using Oceananigans.Fields: ZeroField
-using Oceananigans.BoundaryConditions: default_prognostic_bc, DefaultBoundaryCondition
-using Oceananigans.BoundaryConditions: BoundaryCondition, FieldBoundaryConditions
-using Oceananigans.BoundaryConditions: DiscreteBoundaryFunction, FluxBoundaryCondition
-using Oceananigans.BuoyancyModels: ∂z_b, top_buoyancy_flux
-using Oceananigans.Grids: inactive_cell
-
-using Oceananigans.TurbulenceClosures:
-    getclosure,
-    time_discretization,
-    AbstractScalarDiffusivity,
-    VerticallyImplicitTimeDiscretization,
-    VerticalFormulation
-    
-import Oceananigans.BoundaryConditions: getbc
-import Oceananigans.Utils: with_tracers
-import Oceananigans.TurbulenceClosures:
-    validate_closure,
-    shear_production,
-    buoyancy_flux,
-    dissipation,
-    add_closure_specific_boundary_conditions,
-    compute_diffusivities!,
-    DiffusivityFields,
-    implicit_linear_coefficient,
-    viscosity,
-    diffusivity,
-    viscosity_location,
-    diffusivity_location,
-    diffusive_flux_x,
-    diffusive_flux_y,
-    diffusive_flux_z
-
-const c = Center()
-const f = Face()
-
-# @inline ℑbzᵃᵃᶜ(i, j, k, grid, fᵃᵃᶠ, args...) = ℑzᵃᵃᶜ(i, j, k, grid, fᵃᵃᶠ, args...)
-
-# A particular kind of reconstruction that ignores peripheral nodes
-@inline function ℑbzᵃᵃᶜ(i, j, k, grid, fᵃᵃᶠ, args...)
-    k⁺ = k + 1
-    k⁻ = k
-
-    f⁺ = fᵃᵃᶠ(i, j, k⁺, grid, args...)
-    f⁻ = fᵃᵃᶠ(i, j, k⁻, grid, args...)
-
-    p⁺ = peripheral_node(i, j, k⁺, grid, c, c, f)
-    p⁻ = peripheral_node(i, j, k⁻, grid, c, c, f)
-
-    f⁺ = ifelse(p⁺, f⁻, f⁺)
-    f⁻ = ifelse(p⁻, f⁺, f⁻)
-
-    return (f⁺ + f⁻) / 2
-end
 
 struct CATKEVerticalDiffusivity{TD, CL, FT, DT, TKE} <: AbstractScalarDiffusivity{TD, VerticalFormulation, 2}
     mixing_length :: CL
@@ -73,10 +5,10 @@ struct CATKEVerticalDiffusivity{TD, CL, FT, DT, TKE} <: AbstractScalarDiffusivit
     maximum_tracer_diffusivity :: FT
     maximum_tke_diffusivity :: FT
     maximum_viscosity :: FT
-    minimum_turbulent_kinetic_energy :: FT
+    minimum_tke :: FT
     minimum_convective_buoyancy_flux :: FT
-    negative_turbulent_kinetic_energy_damping_time_scale :: FT
-    turbulent_kinetic_energy_time_step :: DT
+    negative_tke_damping_time_scale :: FT
+    tke_time_step :: DT
 end
 
 function CATKEVerticalDiffusivity{TD}(mixing_length::CL,
@@ -84,20 +16,20 @@ function CATKEVerticalDiffusivity{TD}(mixing_length::CL,
                                       maximum_tracer_diffusivity::FT,
                                       maximum_tke_diffusivity::FT,
                                       maximum_viscosity::FT,
-                                      minimum_turbulent_kinetic_energy::FT,
+                                      minimum_tke::FT,
                                       minimum_convective_buoyancy_flux::FT,
-                                      negative_turbulent_kinetic_energy_damping_time_scale::FT, 
-                                      turbulent_kinetic_energy_time_step::DT) where {TD, CL, FT, DT, TKE}
+                                      negative_tke_damping_time_scale::FT, 
+                                      tke_time_step::DT) where {TD, CL, FT, DT, TKE}
 
     return CATKEVerticalDiffusivity{TD, CL, FT, DT, TKE}(mixing_length,
                                                          turbulent_kinetic_energy_equation,
                                                          maximum_tracer_diffusivity,
                                                          maximum_tke_diffusivity,
                                                          maximum_viscosity,
-                                                         minimum_turbulent_kinetic_energy,
+                                                         minimum_tke,
                                                          minimum_convective_buoyancy_flux,
-                                                         negative_turbulent_kinetic_energy_damping_time_scale,
-                                                         turbulent_kinetic_energy_time_step)
+                                                         negative_tke_damping_time_scale,
+                                                         tke_time_step)
 end
 
 CATKEVerticalDiffusivity(FT::DataType; kw...) =
@@ -107,22 +39,18 @@ const CATKEVD{TD} = CATKEVerticalDiffusivity{TD} where TD
 const CATKEVDArray{TD} = AbstractArray{<:CATKEVD{TD}} where TD
 const FlavorOfCATKE{TD} = Union{CATKEVD{TD}, CATKEVDArray{TD}} where TD
 
-include("mixing_length.jl")
-include("turbulent_kinetic_energy_equation.jl")
-include("time_step_turbulent_kinetic_energy.jl")
-
 """
     CATKEVerticalDiffusivity([time_discretization = VerticallyImplicitTimeDiscretization(),
                              FT = Float64;]
-                             mixing_length = MixingLength(),
-                             turbulent_kinetic_energy_equation = TurbulentKineticEnergyEquation(),
+                             mixing_length = CATKEMixingLength(),
+                             turbulent_kinetic_energy_equation = CATKEEquation(),
                              maximum_tracer_diffusivity = Inf,
                              maximum_tke_diffusivity = Inf,
                              maximum_viscosity = Inf,
-                             minimum_turbulent_kinetic_energy = 1e-9,
+                             minimum_tke = 1e-9,
                              minimum_convective_buoyancy_flux = 1e-11,
-                             negative_turbulent_kinetic_energy_damping_time_scale = 1minute,
-                             turbulent_kinetic_energy_time_step = nothing)
+                             negative_tke_damping_time_scale = 1minute,
+                             tke_time_step = nothing)
 
 Return the `CATKEVerticalDiffusivity` turbulence closure for vertical mixing by
 small-scale ocean turbulence based on the prognostic evolution of subgrid
@@ -148,35 +76,35 @@ Keyword arguments
 =================
 
 - `maximum_diffusivity`: Maximum value for tracer, momentum, and TKE diffusivities.
-                        Used to clip the diffusivity when/if CATKE predicts
-                        diffusivities that are too large.
-                        Default: `Inf`.
+                         Used to clip the diffusivity when/if CATKE predicts
+                         diffusivities that are too large.
+                         Default: `Inf`.
 
-- `minimum_turbulent_kinetic_energy`: Minimum value for the turbulent kinetic energy.
-                                    Can be used to model the presence "background" TKE
-                                    levels due to, for example, mixing by breaking internal waves.
-                                    Default: 0.
+- `minimum_tke`: Minimum value for the turbulent kinetic energy.
+                 Can be used to model the presence of "background" TKE
+                 levels due to, for example, mixing by breaking internal waves.
+                 Default: 1e-9.
 
-- `negative_turbulent_kinetic_energy_damping_time_scale`: Damping time-scale for spurious negative values of TKE,
-                                                        typically generated by oscillatory errors associated
-                                                        with TKE advection.
-                                                        Default: 1 minute.
+- `negative_tke_damping_time_scale`: Damping time-scale for spurious negative values of TKE,
+                                     typically generated by oscillatory errors associated
+                                     with TKE advection.
+                                     Default: 1 minute.
 
 Note that for numerical stability, it is recommended to either have a relative short
-`negative_turbulent_kinetic_energy_damping_time_scale` or a reasonable
-`minimum_turbulent_kinetic_energy`, or both.
+`negative_tke_damping_time_scale` or a reasonable
+`minimum_tke`, or both.
 """
 function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTimeDiscretization(),
                                   FT = Float64;
-                                  mixing_length = MixingLength(),
-                                  turbulent_kinetic_energy_equation = TurbulentKineticEnergyEquation(),
+                                  mixing_length = CATKEMixingLength(),
+                                  turbulent_kinetic_energy_equation = CATKEEquation(),
                                   maximum_tracer_diffusivity = Inf,
                                   maximum_tke_diffusivity = Inf,
                                   maximum_viscosity = Inf,
-                                  minimum_turbulent_kinetic_energy = 1e-9,
+                                  minimum_tke = 1e-9,
                                   minimum_convective_buoyancy_flux = 1e-11,
-                                  negative_turbulent_kinetic_energy_damping_time_scale = 1minute,
-                                  turbulent_kinetic_energy_time_step = nothing) where TD
+                                  negative_tke_damping_time_scale = 1minute,
+                                  tke_time_step = nothing) where TD
 
     mixing_length = convert_eltype(FT, mixing_length)
     turbulent_kinetic_energy_equation = convert_eltype(FT, turbulent_kinetic_energy_equation)
@@ -186,10 +114,10 @@ function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTi
                                         convert(FT, maximum_tracer_diffusivity),
                                         convert(FT, maximum_tke_diffusivity),
                                         convert(FT, maximum_viscosity),
-                                        convert(FT, minimum_turbulent_kinetic_energy),
+                                        convert(FT, minimum_tke),
                                         convert(FT, minimum_convective_buoyancy_flux),
-                                        convert(FT, negative_turbulent_kinetic_energy_damping_time_scale),
-                                        turbulent_kinetic_energy_time_step)
+                                        convert(FT, negative_tke_damping_time_scale),
+                                        tke_time_step)
 end
 
 function with_tracers(tracer_names, closure::FlavorOfCATKE)
@@ -213,29 +141,6 @@ catke_first(catke::FlavorOfCATKE, closure2) = true
 catke_first(closure1, closure2) = false
 catke_first(catke1::FlavorOfCATKE, catke2::FlavorOfCATKE) = error("Can't have two CATKEs in one closure tuple.")
 
-#####
-##### Mixing length and TKE equation
-#####
-
-@inline Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy) =
-    ℑbzᵃᵃᶜ(i, j, k, grid, Riᶜᶜᶠ, velocities, tracers, buoyancy)
-
-@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
-    ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ϕ², ∂zᶠᶜᶠ, velocities.u)
-    ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ϕ², ∂zᶜᶠᶠ, velocities.v)
-    N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
-    S² = ∂z_u² + ∂z_v²
-    Ri = N² / S²
-    #return ifelse(N² ≤ 0, zero(grid), Ri)
-    return ifelse(N² == 0, zero(grid), Ri)
-end
-
-for S in (:MixingLength, :TurbulentKineticEnergyEquation)
-    @eval @inline convert_eltype(::Type{FT}, s::$S) where FT =
-        $S{FT}(; Dict(p => getproperty(s, p) for p in propertynames(s))...)
-    @eval @inline convert_eltype(::Type{FT}, s::$S{FT}) where FT = s
-end
-
 #####
 ##### Diffusivities and diffusivity fields utilities
 #####
@@ -270,12 +175,8 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfCATKE)
             _tupled_tracer_diffusivities, _tupled_implicit_linear_coefficients)
 end        
 
-const c = Center()
-const f = Face()
-
 @inline viscosity_location(::FlavorOfCATKE) = (c, c, f)
 @inline diffusivity_location(::FlavorOfCATKE) = (c, c, f)
-@inline clip(x) = max(zero(x), x)
 
 function compute_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; parameters = :xyz)
 
@@ -293,7 +194,7 @@ function compute_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; pa
         # Compute e at the current time:
         #   * update tendency Gⁿ using current and previous velocity field
         #   * use tridiagonal solve to take an implicit step
-        time_step_turbulent_kinetic_energy!(model)
+        time_step_catke_equation!(model)
     end
 
     # Update "previous velocities"
@@ -333,13 +234,6 @@ end
     @inbounds Jᵇ[i, j, 1] = (Jᵇᵢⱼ + ϵ * Jᵇ★) / (1 + ϵ)
 end
 
-@inline function mask_diffusivity(i, j, k, grid, κ★)
-    on_periphery = peripheral_node(i, j, k, grid, c, c, f)
-    within_inactive = inactive_node(i, j, k, grid, c, c, f)
-    nan = convert(eltype(grid), NaN)
-    return ifelse(on_periphery, zero(grid), ifelse(within_inactive, nan, κ★))
-end
-
 @kernel function compute_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy)
     i, j, k = @index(Global, NTuple)
 
@@ -364,12 +258,6 @@ end
     end
 end
 
-@inline function turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)
-    eᵢ = @inbounds e[i, j, k]
-    eᵐⁱⁿ = closure.minimum_turbulent_kinetic_energy
-    return sqrt(max(eᵐⁱⁿ, eᵢ))
-end
-
 @inline function κuᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
     ℓu = momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
@@ -406,16 +294,16 @@ function Base.summary(closure::CATKEVD)
     return string("CATKEVerticalDiffusivity{$TD}")
 end
 
-function Base.show(io::IO, clo::FlavorOfCATKE)
+function Base.show(io::IO, clo::CATKEVD)
     print(io, summary(clo))
     print(io, '\n')
     print(io, "├── maximum_tracer_diffusivity: ", prettysummary(clo.maximum_tracer_diffusivity), '\n',
               "├── maximum_tke_diffusivity: ", prettysummary(clo.maximum_tke_diffusivity), '\n',
               "├── maximum_viscosity: ", prettysummary(clo.maximum_viscosity), '\n',
-              "├── minimum_turbulent_kinetic_energy: ", prettysummary(clo.minimum_turbulent_kinetic_energy), '\n',
-              "├── negative_turbulent_kinetic_energy_damping_time_scale: ", prettysummary(clo.negative_turbulent_kinetic_energy_damping_time_scale), '\n',
+              "├── minimum_tke: ", prettysummary(clo.minimum_tke), '\n',
+              "├── negative_tke_time_scale: ", prettysummary(clo.negative_tke_damping_time_scale), '\n',
               "├── minimum_convective_buoyancy_flux: ", prettysummary(clo.minimum_convective_buoyancy_flux), '\n',
-              "├── turbulent_kinetic_energy_time_step: ", prettysummary(clo.turbulent_kinetic_energy_time_step), '\n',
+              "├── tke_time_step: ", prettysummary(clo.tke_time_step), '\n',
               "├── mixing_length: ", prettysummary(clo.mixing_length), '\n',
               "│   ├── Cˢ:   ", prettysummary(clo.mixing_length.Cˢ), '\n',
               "│   ├── Cᵇ:   ", prettysummary(clo.mixing_length.Cᵇ), '\n',
@@ -447,5 +335,3 @@ function Base.show(io::IO, clo::FlavorOfCATKE)
               "    └── Cᵂϵ:  ", prettysummary(clo.turbulent_kinetic_energy_equation.Cᵂϵ))
 end
 
-end # module
-
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/time_step_turbulent_kinetic_energy.jl b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/time_step_catke_equation.jl
similarity index 87%
rename from src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/time_step_turbulent_kinetic_energy.jl
rename to src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/time_step_catke_equation.jl
index a52168e4b4..6658bd56f1 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/time_step_turbulent_kinetic_energy.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/time_step_catke_equation.jl
@@ -1,24 +1,24 @@
 using Oceananigans: fields
 using Oceananigans.Advection: div_Uc, U_dot_∇u, U_dot_∇v
 using Oceananigans.Fields: immersed_boundary_condition
-using Oceananigans.Grids: active_interior_map
+using Oceananigans.Grids: retrieve_interior_active_cells_map
 using Oceananigans.BoundaryConditions: apply_x_bcs!, apply_y_bcs!, apply_z_bcs!
 using Oceananigans.TimeSteppers: store_field_tendencies!, ab2_step_field!, implicit_step!
 using Oceananigans.TurbulenceClosures: ∇_dot_qᶜ, immersed_∇_dot_qᶜ, hydrostatic_turbulent_kinetic_energy_tendency
 using CUDA
 
-tke_time_step(closure::CATKEVerticalDiffusivity) = closure.turbulent_kinetic_energy_time_step
+get_time_step(closure::CATKEVerticalDiffusivity) = closure.tke_time_step
 
-function tke_time_step(closure_array::AbstractArray)
-    # assume they are all the same
-    closure = CUDA.@allowscalar closure_array[1, 1]
-    return tke_time_step(closure)
-end
-
-function time_step_turbulent_kinetic_energy!(model)
+function time_step_catke_equation!(model)
 
     # TODO: properly handle closure tuples
-    closure = model.closure
+    if model.closure isa Tuple
+        closure = first(model.closure)
+        diffusivity_fields = first(model.diffusivity_fields)
+    else
+        closure = model.closure
+        diffusivity_fields = model.diffusivity_fields
+    end
 
     e = model.tracers.e
     arch = model.architecture
@@ -26,20 +26,19 @@ function time_step_turbulent_kinetic_energy!(model)
     Gⁿe = model.timestepper.Gⁿ.e
     G⁻e = model.timestepper.G⁻.e
 
-    diffusivity_fields = model.diffusivity_fields
     κe = diffusivity_fields.κe
     Le = diffusivity_fields.Le
     previous_velocities = diffusivity_fields.previous_velocities
     tracer_index = findfirst(k -> k == :e, keys(model.tracers))
     implicit_solver = model.timestepper.implicit_solver
 
-    Δt = model.clock.last_Δt # simulation time-step (for velocities, tracers, etc)
-    Δτ = tke_time_step(closure) # special time-step for turbulent kinetic energy
+    Δt = model.clock.last_Δt
+    Δτ = get_time_step(closure)
 
     if isnothing(Δτ)
         Δτ = Δt
         M = 1
-    else # limit TKE time-step Δτ by Δt.
+    else
         M = ceil(Int, Δt / Δτ) # number of substeps
         Δτ = Δt / M
     end
@@ -69,7 +68,7 @@ function time_step_turbulent_kinetic_energy!(model)
         # previous_clock = (; time=current_time, iteration=previous_iteration)
 
         implicit_step!(e, implicit_solver, closure,
-                       model.diffusivity_fields, Val(tracer_index),
+                       diffusivity_fields, Val(tracer_index),
                        model.clock, Δτ)
     end
 
@@ -98,7 +97,7 @@ end
     wb⁺ = max(zero(grid), wb)
 
     eⁱʲᵏ = @inbounds e[i, j, k]
-    eᵐⁱⁿ = closure_ij.minimum_turbulent_kinetic_energy
+    eᵐⁱⁿ = closure_ij.minimum_tke
     wb⁻_e = wb⁻ / eⁱʲᵏ * (eⁱʲᵏ > eᵐⁱⁿ)
 
     # Treat the divergence of TKE flux at solid bottoms implicitly.
@@ -118,11 +117,11 @@ end
     # Then the contribution of Jᵉ to the implicit flux is
     #
     #       Lᵂ = - Cᵂϵ * √e / Δz.
-    #
+    
     on_bottom = !inactive_cell(i, j, k, grid) & inactive_cell(i, j, k-1, grid)
     Δz = Δzᶜᶜᶜ(i, j, k, grid)
     Cᵂϵ = closure_ij.turbulent_kinetic_energy_equation.Cᵂϵ
-    e⁺ = clip(eⁱʲᵏ) # ensure that eⁱʲᵏ > 0
+    e⁺ = clip(eⁱʲᵏ)
     w★ = sqrt(e⁺)
     div_Jᵉ_e = - on_bottom * Cᵂϵ * w★ / Δz
 
@@ -178,7 +177,7 @@ end
 end
 
 #=
-using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: FlavorOfCATKE
+using Oceananigans.TurbulenceClosures.TKEBasedVerticalDiffusivities: FlavorOfCATKE
 
 @inline tracer_tendency_kernel_function(model::HFSM, name, c, K)                     = compute_hydrostatic_free_surface_Gc!, c, K
 @inline tracer_tendency_kernel_function(model::HFSM, ::Val{:e}, c::FlavorOfCATKE, K) = compute_hydrostatic_free_surface_Ge!, c, K
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_dissipation_equations.jl b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_dissipation_equations.jl
new file mode 100644
index 0000000000..d24bb8d84b
--- /dev/null
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_dissipation_equations.jl
@@ -0,0 +1,299 @@
+using Oceananigans: fields
+using Oceananigans.Advection: div_Uc, U_dot_∇u, U_dot_∇v
+using Oceananigans.Fields: immersed_boundary_condition
+using Oceananigans.Grids: retrieve_interior_active_cells_map
+using Oceananigans.BoundaryConditions: apply_x_bcs!, apply_y_bcs!, apply_z_bcs!
+using Oceananigans.TimeSteppers: store_field_tendencies!, ab2_step_field!, implicit_step!
+using Oceananigans.TurbulenceClosures: ∇_dot_qᶜ, immersed_∇_dot_qᶜ, hydrostatic_turbulent_kinetic_energy_tendency
+using CUDA
+
+Base.@kwdef struct TKEDissipationEquations{FT} # coefficients read by the e/ϵ substep kernel and the top-flux functions in this file
+    Cᵋϵ :: FT = 1.92 # multiplies ωϵ in the implicit linear coefficient Lϵ of the ϵ-equation
+    Cᴾϵ :: FT = 1.44 # multiplies the shear production P in the explicit ϵ tendency
+    Cᵇϵ :: FT = -0.65 # multiplies the buoyancy flux wb in the ϵ-equation
+    Cᵂu★ :: FT = 0.0 # multiplies u★^3 in the top TKE flux
+    CᵂwΔ :: FT = 0.0 # read by the top TKE flux, but the term that uses it is currently commented out
+    Cᵂα  :: FT = 0.11 # Charnock parameter
+    gravitational_acceleration :: FT = 9.8065 # g in the Charnock length Cᵂα * u★^2 / g
+    minimum_roughness_length :: FT = 1e-4 # lower bound applied to the Charnock length in the top dissipation flux
+end
+
+get_time_step(closure::TKEDissipationVerticalDiffusivity) = closure.tke_dissipation_time_step # `nothing` makes time_step_tke_dissipation_equations! fall back to the model's last Δt
+
+function time_step_tke_dissipation_equations!(model)
+    # Advance the tracers e and ϵ over the last model time step using M explicit substeps, each followed by implicit steps.
+    # TODO: properly handle closure tuples
+    closure = model.closure
+
+    e = model.tracers.e
+    ϵ = model.tracers.ϵ
+    arch = model.architecture
+    grid = model.grid
+    Gⁿe = model.timestepper.Gⁿ.e
+    G⁻e = model.timestepper.G⁻.e
+    Gⁿϵ = model.timestepper.Gⁿ.ϵ
+    G⁻ϵ = model.timestepper.G⁻.ϵ
+
+    diffusivity_fields = model.diffusivity_fields
+    κe = diffusivity_fields.κe
+    κϵ = diffusivity_fields.κϵ
+    Le = diffusivity_fields.Le
+    Lϵ = diffusivity_fields.Lϵ
+    previous_velocities = diffusivity_fields.previous_velocities
+    e_index = findfirst(k -> k == :e, keys(model.tracers))
+    ϵ_index = findfirst(k -> k == :ϵ, keys(model.tracers))
+    implicit_solver = model.timestepper.implicit_solver
+
+    Δt = model.clock.last_Δt     # most recent model time step
+    Δτ = get_time_step(closure)  # requested substep for e and ϵ, or `nothing`
+
+    if isnothing(Δτ)
+        Δτ = Δt
+        M = 1
+    else
+        M = ceil(Int, Δt / Δτ) # number of substeps
+        Δτ = Δt / M            # shrink Δτ so that M substeps cover Δt exactly
+    end
+
+    FT = eltype(grid)
+
+    for m = 1:M # substep
+        if m == 1 && M != 1
+            χ = convert(FT, -0.5) # Euler step for the first substep
+        else
+            χ = model.timestepper.χ
+        end
+
+        # Compute the linear implicit component of the RHS (diffusivities, L)
+        # and step forward
+        launch!(arch, grid, :xyz,
+                substep_tke_dissipation!,
+                κe, κϵ, Le, Lϵ,
+                grid, closure,
+                model.velocities, previous_velocities, # try this soon: model.velocities, model.velocities,
+                model.tracers, model.buoyancy, diffusivity_fields,
+                Δτ, χ, Gⁿe, G⁻e, Gⁿϵ, G⁻ϵ)
+
+        implicit_step!(e, implicit_solver, closure, # use the local diffusivity_fields, as the kernel launch above does
+                       diffusivity_fields, Val(e_index),
+                       model.clock, Δτ)
+
+        implicit_step!(ϵ, implicit_solver, closure,
+                       diffusivity_fields, Val(ϵ_index),
+                       model.clock, Δτ)
+    end
+
+    return nothing
+end
+
+@kernel function substep_tke_dissipation!(κe, κϵ, Le, Lϵ,
+                                          grid, closure,
+                                          next_velocities, previous_velocities,
+                                          tracers, buoyancy, diffusivities,
+                                          Δτ, χ, slow_Gⁿe, G⁻e, slow_Gⁿϵ, G⁻ϵ)
+    # At each (i, j, k): store κe, κϵ and the implicit coefficients Le, Lϵ, then advance e and ϵ explicitly by Δτ.
+    i, j, k = @index(Global, NTuple)
+
+    e = tracers.e
+    ϵ = tracers.ϵ
+
+    closure_ij = getclosure(i, j, closure)
+
+    # Compute TKE and dissipation diffusivities
+    κe★ = κeᶜᶜᶠ(i, j, k, grid, closure_ij, next_velocities, tracers, buoyancy)
+    κϵ★ = κϵᶜᶜᶠ(i, j, k, grid, closure_ij, next_velocities, tracers, buoyancy)
+
+    κe★ = mask_diffusivity(i, j, k, grid, κe★)
+    κϵ★ = mask_diffusivity(i, j, k, grid, κϵ★)
+
+    @inbounds κe[i, j, k] = κe★
+    @inbounds κϵ[i, j, k] = κϵ★
+
+    # Compute TKE and dissipation tendencies
+    ϵ★ = dissipationᶜᶜᶜ(i, j, k, grid, closure_ij, tracers, buoyancy)
+    e★ = turbulent_kinetic_energyᶜᶜᶜ(i, j, k, grid, closure_ij, tracers)
+    eⁱʲᵏ = @inbounds e[i, j, k] # raw tracer values, as opposed to the e★, ϵ★ returned by the helpers above
+    ϵⁱʲᵏ = @inbounds ϵ[i, j, k]
+
+    # Different destruction time-scales for TKE vs dissipation for numerical reasons
+    ω★  = ϵ★ / e★ # target / physical dissipation time scale
+    ωe⁻ = closure_ij.negative_tke_damping_time_scale # NOTE(review): a time *scale* is used as a rate here (ωe is subtracted in Le) — confirm whether its inverse was intended
+    ωe  = ifelse(eⁱʲᵏ < 0, ωe⁻, ω★) # negative TKE uses the damping value instead of ϵ★ / e★
+    ωϵ  = ϵⁱʲᵏ / e★
+
+    # Compute additional diagonal component of the linear TKE operator
+    wb = explicit_buoyancy_flux(i, j, k, grid, closure_ij, next_velocities, tracers, buoyancy, diffusivities)
+
+    # Patankar trick for TKE equation
+    wb⁻ = min(wb, zero(grid)) # non-positive part: goes into the implicit coefficient Le
+    wb⁺ = max(wb, zero(grid)) # non-negative part: goes into the explicit tendency
+
+    eᵐⁱⁿ = closure_ij.minimum_tke
+    wb⁻_e = wb⁻ / eⁱʲᵏ * (eⁱʲᵏ > eᵐⁱⁿ) # multiplied by `false` (i.e. switched off) unless e exceeds eᵐⁱⁿ
+
+    # Patankar trick for ϵ-equation
+    Cᵋϵ = closure_ij.tke_dissipation_equations.Cᵋϵ
+    Cᵇϵ = closure_ij.tke_dissipation_equations.Cᵇϵ
+
+    Cᵇϵ_wb⁻ = min(Cᵇϵ * wb, zero(grid)) # non-positive part: implicit (Lϵ)
+    Cᵇϵ_wb⁺ = max(Cᵇϵ * wb, zero(grid)) # non-negative part: explicit tendency
+
+    # ∂t e = Lⁱ e + ⋯,
+    @inbounds Le[i, j, k] = wb⁻_e - ωe
+    @inbounds Lϵ[i, j, k] = Cᵇϵ_wb⁻ / e★ - Cᵋϵ * ωϵ
+
+    # Compute fast TKE and dissipation RHSs
+    u⁺ = next_velocities.u
+    v⁺ = next_velocities.v
+    uⁿ = previous_velocities.u
+    vⁿ = previous_velocities.v
+    κu = diffusivities.κu
+    Cᴾϵ = closure_ij.tke_dissipation_equations.Cᴾϵ
+
+    # TODO: correctly handle closure / diffusivity tuples
+    # TODO: the shear_production is actually a slow term so we _could_ precompute.
+    P = shear_production(i, j, k, grid, κu, uⁿ, u⁺, vⁿ, v⁺)
+
+    @inbounds begin
+        fast_Gⁿe = P + wb⁺                  # - ϵ (no implicit time stepping for now)
+        fast_Gⁿϵ = ωϵ * (Cᴾϵ * P + Cᵇϵ_wb⁺) # production and positive buoyancy term, scaled by ωϵ
+    end
+
+    # Advance TKE and store tendency
+    FT = eltype(χ)
+    Δτ = convert(FT, Δτ)
+
+    # Weights for the current (α) and previous (β) tendencies; χ = -0.5 gives α = 1, β = 0.
+    α = convert(FT, 1.5) + χ
+    β = convert(FT, 0.5) + χ
+
+    @inbounds begin
+        total_Gⁿe = slow_Gⁿe[i, j, k] + fast_Gⁿe
+        total_Gⁿϵ = slow_Gⁿϵ[i, j, k] + fast_Gⁿϵ
+
+        e[i, j, k] += Δτ * (α * total_Gⁿe - β * G⁻e[i, j, k])
+        ϵ[i, j, k] += Δτ * (α * total_Gⁿϵ - β * G⁻ϵ[i, j, k])
+
+        G⁻e[i, j, k] = total_Gⁿe # becomes the "previous" tendency for the next substep
+        G⁻ϵ[i, j, k] = total_Gⁿϵ
+    end
+end
+
+@inline function implicit_linear_coefficient(i, j, k, grid, closure::FlavorOfTD{<:VITD}, K, ::Val{id}, args...) where id
+    Lᵢ = getindex(K._tupled_implicit_linear_coefficients, id) # coefficient array selected by the Val-encoded index `id`
+    return @inbounds Lᵢ[i, j, k]
+end
+
+#####
+##### TKE top boundary condition
+#####
+
+@inline function top_tke_flux(i, j, grid, clock, fields, parameters, closure::FlavorOfTD, buoyancy)
+    # Pick the closure for (i, j) via getclosure, then forward to the method chosen by its equation parameters.
+    closure_ij = getclosure(i, j, closure)
+    equations = closure_ij.tke_dissipation_equations
+    tracer_bcs = parameters.top_tracer_boundary_conditions
+    velocity_bcs = parameters.top_velocity_boundary_conditions
+
+    return _top_tke_flux(i, j, grid, clock, fields, equations, closure_ij,
+                         buoyancy, tracer_bcs, velocity_bcs)
+end
+
+@inline function _top_tke_flux(i, j, grid, clock, fields,
+                               parameters::TKEDissipationEquations, closure::TDVD,
+                               buoyancy, top_tracer_bcs, top_velocity_bcs)
+    # Returns the top TKE flux; only the friction-velocity term -Cᵂu★ * u★^3 is currently active.
+    wΔ³ = top_convective_turbulent_velocity_cubed(i, j, grid, clock, fields, buoyancy, top_tracer_bcs) # only needed by the commented-out convective term
+    u★ = friction_velocity(i, j, grid, clock, fields, top_velocity_bcs)
+
+    Cᵂu★ = parameters.Cᵂu★
+    CᵂwΔ = parameters.CᵂwΔ # only needed by the commented-out convective term
+
+    return - Cᵂu★ * u★^3 #- CᵂwΔ * wΔ³
+end
+
+@inline function top_dissipation_flux(i, j, grid, clock, fields, parameters, closure::FlavorOfTD, buoyancy)
+    # Pick the closure for (i, j) via getclosure, then forward to the method chosen by its equation parameters.
+    closure_ij = getclosure(i, j, closure)
+    equations = closure_ij.tke_dissipation_equations
+    tracer_bcs = parameters.top_tracer_boundary_conditions
+    velocity_bcs = parameters.top_velocity_boundary_conditions
+
+    return _top_dissipation_flux(i, j, grid, clock, fields, equations, closure_ij,
+                                 buoyancy, tracer_bcs, velocity_bcs)
+end
+
+@inline function _top_dissipation_flux(i, j, grid, clock, fields, parameters::TKEDissipationEquations,
+                                       closure::TDVD, buoyancy, top_tracer_bcs, top_velocity_bcs)
+    # Returns -𝕊u₀^4 / σϵ * e★^2 / (d + ℓᵣ), evaluated at the vertical index grid.Nz.
+    stability_functions = closure.stability_functions
+    𝕊u₀ = stability_functions.𝕊u₀
+    σϵ = stability_functions.Cσϵ
+
+    # Roughness length ℓᵣ: Charnock estimate Cᵂα * u★^2 / g, bounded below by the minimum roughness length.
+    u★ = friction_velocity(i, j, grid, clock, fields, top_velocity_bcs)
+    charnock_length = parameters.Cᵂα * u★^2 / parameters.gravitational_acceleration
+    ℓᵣ = max(parameters.minimum_roughness_length, charnock_length)
+
+    # TKE and the distance d = -z at location (c, c, c) for vertical index Nz.
+    Nz = grid.Nz
+    e★ = turbulent_kinetic_energyᶜᶜᶜ(i, j, Nz, grid, closure, fields)
+    depth = - znode(i, j, Nz, grid, c, c, c)
+
+    numerator = - 𝕊u₀^4 / σϵ * e★^2
+
+    return numerator / (depth + ℓᵣ)
+end
+
+#####
+##### Utilities for model constructors
+#####
+
+""" Add TKE and dissipation top boundary conditions specific to `TKEDissipationVerticalDiffusivity`. """
+function add_closure_specific_boundary_conditions(closure::FlavorOfTD,
+                                                  user_bcs,
+                                                  grid,
+                                                  tracer_names,
+                                                  buoyancy)
+
+    top_tracer_bcs = top_tracer_boundary_conditions(grid, tracer_names, user_bcs)
+    top_velocity_bcs = top_velocity_boundary_conditions(grid, user_bcs)
+    parameters = TKETopBoundaryConditionParameters(top_tracer_bcs, top_velocity_bcs)
+    top_tke_bc = FluxBoundaryCondition(top_tke_flux, discrete_form=true, parameters=parameters)
+
+    top_dissipation_bc = FluxBoundaryCondition(top_dissipation_flux, discrete_form=true, parameters=parameters)
+
+    # User-supplied `e` boundary conditions are kept on every side except the top, which is always replaced.
+    if :e ∈ keys(user_bcs)
+        e_bcs = user_bcs[:e]
+
+        tke_bcs = FieldBoundaryConditions(grid, (Center, Center, Center),
+                                          top = top_tke_bc,
+                                          bottom = e_bcs.bottom,
+                                          north = e_bcs.north,
+                                          south = e_bcs.south,
+                                          east = e_bcs.east,
+                                          west = e_bcs.west)
+    else
+        tke_bcs = FieldBoundaryConditions(grid, (Center, Center, Center), top=top_tke_bc)
+    end
+
+    if :ϵ ∈ keys(user_bcs)
+        ϵ_bcs = user_bcs[:ϵ]
+        # The non-top sides must come from the user's ϵ conditions (not e's, which may not even be defined here).
+        dissipation_bcs = FieldBoundaryConditions(grid, (Center, Center, Center),
+                                                  top = top_dissipation_bc,
+                                                  bottom = ϵ_bcs.bottom,
+                                                  north = ϵ_bcs.north,
+                                                  south = ϵ_bcs.south,
+                                                  east = ϵ_bcs.east,
+                                                  west = ϵ_bcs.west)
+    else
+        dissipation_bcs = FieldBoundaryConditions(grid, (Center, Center, Center), top=top_dissipation_bc)
+    end
+
+    new_boundary_conditions = merge(user_bcs, (e=tke_bcs, ϵ=dissipation_bcs))
+
+    return new_boundary_conditions
+end
+
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_dissipation_stability_functions.jl b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_dissipation_stability_functions.jl
new file mode 100644
index 0000000000..ad130a2c42
--- /dev/null
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_dissipation_stability_functions.jl
@@ -0,0 +1,292 @@
+# Supertype for stability functions that carry the constants `Cσe` and `Cσϵ`. For these, the TKE
+# and dissipation stability functions are the momentum stability function divided by a constant.
+abstract type AbstractConstantSchmidtStabilityFunctions end
+
+# Closures whose third type parameter (the type of `stability_functions`) is a subtype of the above.
+const ConstantSchmidtStabilityTDVD = TKEDissipationVerticalDiffusivity{<:Any, <:Any, <:AbstractConstantSchmidtStabilityFunctions}
+
+"""
+Return the stability function for TKE diffusion: the momentum stability function
+divided by the constant `Cσe`.
+"""
+@inline function tke_stability_functionᶜᶜᶠ(i, j, k, grid, closure::ConstantSchmidtStabilityTDVD, args...)
+    momentum_𝕊 = momentum_stability_functionᶜᶜᶠ(i, j, k, grid, closure, args...)
+    return momentum_𝕊 / closure.stability_functions.Cσe
+end
+        
+"""
+Return the stability function for dissipation diffusion: the momentum stability function
+divided by the constant `Cσϵ`.
+"""
+@inline function dissipation_stability_functionᶜᶜᶠ(i, j, k, grid, closure::ConstantSchmidtStabilityTDVD, args...)
+    momentum_𝕊 = momentum_stability_functionᶜᶜᶠ(i, j, k, grid, closure, args...)
+    return momentum_𝕊 / closure.stability_functions.Cσϵ
+end
+
+# Stability functions whose momentum and tracer values are the constants `Cu₀` and `Cc₀`.
+# NOTE(review): the `# √3` annotations below do not match the values (√3 ≈ 1.73). 0.53 is close to
+# the `𝕊u₀ ≈ 0.533` that `VariableStabilityFunctions` computes from its default coefficients;
+# confirm the intended provenance.
+Base.@kwdef struct ConstantStabilityFunctions{FT} <: AbstractConstantSchmidtStabilityFunctions
+    Cσe :: FT = 1.0  # divides the momentum stability function to give the TKE one
+    Cσϵ :: FT = 1.2  # divides the momentum stability function to give the dissipation one
+    Cu₀ :: FT = 0.53 # √3
+    Cc₀ :: FT = 0.53 # √3
+    𝕊u₀ :: FT = 0.53 # √3
+end
+
+# One-line description: the type name with its float-type parameter.
+function Base.summary(s::ConstantStabilityFunctions{FT}) where FT
+    return string("ConstantStabilityFunctions{", FT, "}")
+end
+
+# Multi-line, tree-style listing of the constants. `prefix` is prepended to the header line only;
+# `sep` is accepted for signature compatibility but is not used.
+function summarize_stability_functions(s::ConstantStabilityFunctions{FT}, prefix="", sep="│   ") where FT
+    header = string(prefix, "ConstantStabilityFunctions{$FT}:")
+
+    rows = (header,
+            string("    ├── 𝕊u₀: ", prettysummary(s.𝕊u₀)),
+            string("    ├── Cσe: ", prettysummary(s.Cσe)),
+            string("    ├── Cσϵ: ", prettysummary(s.Cσϵ)),
+            string("    ├── Cu₀: ", prettysummary(s.Cu₀)),
+            string("    └── Cc₀: ", prettysummary(s.Cc₀)))
+
+    return join(rows, '\n')
+end
+
+# Closures whose stability functions are `ConstantStabilityFunctions`.
+const ConstantStabilityTDVD = TKEDissipationVerticalDiffusivity{<:Any, <:Any, <:ConstantStabilityFunctions}
+
+# With constant stability functions the grid indices and model state are ignored.
+@inline function momentum_stability_functionᶜᶜᶠ(i, j, k, grid, c::ConstantStabilityTDVD, args...)
+    return c.stability_functions.Cu₀
+end
+
+@inline function tracer_stability_functionᶜᶜᶠ(i, j, k, grid, c::ConstantStabilityTDVD, args...)
+    return c.stability_functions.Cc₀
+end
+
+# Coefficients of stability functions that vary with the stratification number αᴺ and
+# the shear number αᴹ. The momentum and tracer functions are rational functions that share
+# the same denominator.
+struct VariableStabilityFunctions{FT} <: AbstractConstantSchmidtStabilityFunctions
+    Cσe :: FT # divides the momentum stability function to give the TKE one
+    Cσϵ :: FT # divides the momentum stability function to give the dissipation one
+    Cu₀ :: FT # momentum numerator: constant term
+    Cu₁ :: FT # momentum numerator: coefficient of αᴺ
+    Cu₂ :: FT # momentum numerator: coefficient of αᴹ
+    Cc₀ :: FT # tracer numerator: constant term
+    Cc₁ :: FT # tracer numerator: coefficient of αᴺ
+    Cc₂ :: FT # tracer numerator: coefficient of αᴹ
+    Cd₀ :: FT # shared denominator: constant term
+    Cd₁ :: FT # shared denominator: coefficient of αᴺ
+    Cd₂ :: FT # shared denominator: coefficient of αᴹ
+    Cd₃ :: FT # shared denominator: coefficient of αᴺ * αᴹ
+    Cd₄ :: FT # shared denominator: coefficient of αᴺ^2
+    Cd₅ :: FT # shared denominator: coefficient of αᴹ^2
+    𝕊u₀ :: FT # constant used by `minimum_dissipation` to set the floor on ϵ
+end
+
+"""
+    VariableStabilityFunctions(FT=Float64; kw...)
+
+Build `VariableStabilityFunctions` with every coefficient converted to `FT`.
+
+When `𝕊u₀` is `nothing` it is computed from `Cd₅`, `Cu₂`, `Cd₂`, `Cu₀` and `Cd₀`
+for the logarithmic boundary layer, where production balances dissipation.
+"""
+function VariableStabilityFunctions(FT=Float64; 
+                                    Cσe = 1.0,
+                                    Cσϵ = 1.2,
+                                    Cu₀ = 0.1067,
+                                    Cu₁ = 0.0173,
+                                    Cu₂ = -0.0001205,
+                                    Cc₀ = 0.1120,
+                                    Cc₁ = 0.003766,
+                                    Cc₂ = 0.0008871,
+                                    Cd₀ = 1.0,
+                                    Cd₁ = 0.2398,
+                                    Cd₂ = 0.02872,
+                                    Cd₃ = 0.005154,
+                                    Cd₄ = 0.006930,
+                                    Cd₅ = -0.0003372,
+                                    𝕊u₀ = nothing)
+
+    if 𝕊u₀ === nothing
+        # Compute 𝕊u₀ for the logarithmic boundary layer where production
+        # balances dissipation. For more information see the discussion
+        # surrounding equation (13) in Umlauf and Burchard (2003).
+        a = Cd₅ - Cu₂
+        b = Cd₂ - Cu₀
+        c = Cd₀
+        discriminant = b^2 - 4a * c
+        𝕊u₀ = (2a / (-b - sqrt(discriminant)))^(1/4)
+    end
+
+    # Same order as the struct fields.
+    coefficients = (Cσe, Cσϵ,
+                    Cu₀, Cu₁, Cu₂,
+                    Cc₀, Cc₁, Cc₂,
+                    Cd₀, Cd₁, Cd₂, Cd₃, Cd₄, Cd₅,
+                    𝕊u₀)
+
+    return VariableStabilityFunctions(map(x -> convert(FT, x), coefficients)...)
+end
+
+# One-line description: the type name with its float-type parameter.
+function Base.summary(s::VariableStabilityFunctions{FT}) where FT
+    return string("VariableStabilityFunctions{", FT, "}")
+end
+
+# Multi-line, tree-style listing of the coefficients. As in the `ConstantStabilityFunctions`
+# method, `prefix` is prepended to the header line and `𝕊u₀` is listed first; `sep` is accepted
+# for signature compatibility but is not used.
+summarize_stability_functions(s::VariableStabilityFunctions{FT}, prefix="", sep="") where FT =
+    string(prefix, "VariableStabilityFunctions{$FT}:", '\n',
+           "    ├── 𝕊u₀: ", prettysummary(s.𝕊u₀), '\n',
+           "    ├── Cσe: ", prettysummary(s.Cσe), '\n',
+           "    ├── Cσϵ: ", prettysummary(s.Cσϵ), '\n',
+           "    ├── Cu₀: ", prettysummary(s.Cu₀), '\n',
+           "    ├── Cu₁: ", prettysummary(s.Cu₁), '\n',
+           "    ├── Cu₂: ", prettysummary(s.Cu₂), '\n',
+           "    ├── Cc₀: ", prettysummary(s.Cc₀), '\n',
+           "    ├── Cc₁: ", prettysummary(s.Cc₁), '\n',
+           "    ├── Cc₂: ", prettysummary(s.Cc₂), '\n',
+           "    ├── Cd₀: ", prettysummary(s.Cd₀), '\n',
+           "    ├── Cd₁: ", prettysummary(s.Cd₁), '\n',
+           "    ├── Cd₂: ", prettysummary(s.Cd₂), '\n',
+           "    ├── Cd₃: ", prettysummary(s.Cd₃), '\n',
+           "    ├── Cd₄: ", prettysummary(s.Cd₄), '\n',
+           "    └── Cd₅: ", prettysummary(s.Cd₅))
+
+"""
+Return `e² / ϵ²`, using the limited TKE and limited dissipation at index `i, j, k`.
+"""
+@inline function square_time_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers, buoyancy)
+    tke = turbulent_kinetic_energyᶜᶜᶜ(i, j, k, grid, closure, tracers)
+    dissipation = dissipationᶜᶜᶜ(i, j, k, grid, closure, tracers, buoyancy)
+    return tke^2 / dissipation^2
+end
+
+"""
+Return the shear number: the vertically-interpolated square time scale times the
+squared shear of `velocities.u` and `velocities.v`.
+"""
+@inline function shear_numberᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy)
+    square_time_scale = ℑzᵃᵃᶠ(i, j, k, grid, square_time_scaleᶜᶜᶜ, closure, tracers, buoyancy)
+    square_shear = shearᶜᶜᶠ(i, j, k, grid, velocities.u, velocities.v)
+    return square_time_scale * square_shear
+end
+
+"""
+Return the stratification number: the vertically-interpolated square time scale times
+the vertical buoyancy gradient `∂z_b`.
+"""
+@inline function stratification_numberᶜᶜᶠ(i, j, k, grid, closure, tracers, buoyancy)
+    square_time_scale = ℑzᵃᵃᶠ(i, j, k, grid, square_time_scaleᶜᶜᶜ, closure, tracers, buoyancy)
+    buoyancy_gradient = ∂z_b(i, j, k, grid, buoyancy, tracers)
+    return square_time_scale * buoyancy_gradient
+end
+
+# Upper bound used to clamp the stratification number. The value 1e10 is a placeholder (the
+# original author marked it with "?"). It is converted to `eltype(closure)` so that clamping a
+# Float32 stratification number does not promote the result to Float64.
+@inline maximum_stratification_number(closure) = convert(eltype(closure), 1e10)
+
+"""
+    minimum_stratification_number(closure)
+
+Return the lower bound used to clamp the stratification number: a root of the quadratic
+`a αᴺ² + b αᴺ + c = 0` with `a = Cd₄ + Cc₁`, `b = Cd₁ + Cc₀`, `c = Cd₀`, multiplied by
+`closure.minimum_stratification_number_safety_factor`.
+
+Based on an argument for free convection, assuming a balance between
+buoyancy production and dissipation.
+
+See Umlauf and Burchard (2005) equation A.22.
+
+Note that _another_ condition could arise depending on the time discretization,
+as discussed in the text surrounding equations 45-46 in Umlauf and Burchard (2005).
+"""
+@inline function minimum_stratification_number(closure)
+    m₀ = closure.stability_functions.Cc₀
+    m₁ = closure.stability_functions.Cc₁
+
+    d₀ = closure.stability_functions.Cd₀
+    d₁ = closure.stability_functions.Cd₁
+    d₄ = closure.stability_functions.Cd₄
+
+    a = d₄ + m₁
+    b = d₁ + m₀
+    c = d₀
+
+    # Root taken with the `+` sign in front of the square root.
+    αᴺmin = (- b + sqrt(b^2 - 4a*c)) / (2a)
+
+    # Reduce by the "safety factor"
+    safety_factor = closure.minimum_stratification_number_safety_factor
+
+    return αᴺmin * safety_factor
+end
+
+# Lower bound used to clamp the shear number: zero, in the closure's float type.
+@inline function minimum_shear_number(closure::FlavorOfTD)
+    FT = eltype(closure)
+    return zero(FT)
+end
+
+"""
+    maximum_shear_number(closure, αᴺ)
+
+Return the upper bound used to clamp the shear number, as a ratio of two polynomials in
+the stratification number `αᴺ` whose coefficients are built from `Cu₀`, `Cu₁` and `Cd₀`-`Cd₄`.
+
+Based on the condition that shear anisotropy must increase.
+
+See Umlauf and Burchard (2005) equation 44.
+"""
+@inline function maximum_shear_number(closure, αᴺ)
+    n₀ = closure.stability_functions.Cu₀
+    n₁ = closure.stability_functions.Cu₁
+
+    d₀ = closure.stability_functions.Cd₀
+    d₁ = closure.stability_functions.Cd₁
+    d₂ = closure.stability_functions.Cd₂
+    d₃ = closure.stability_functions.Cd₃
+    d₄ = closure.stability_functions.Cd₄
+
+    # Numerator coefficients (cubic in αᴺ)
+    ϵ₀ = d₀ * n₀
+    ϵ₁ = d₀ * n₁ + d₁ * n₀
+    ϵ₂ = d₁ * n₁ + d₄ * n₀
+    ϵ₃ = d₄ * n₁
+
+    # Denominator coefficients (quadratic in αᴺ)
+    ϵ₄ = d₂ * n₀
+    ϵ₅ = d₂ * n₁ + d₃ * n₀
+    ϵ₆ = d₃ * n₁
+
+    num = ϵ₀ + ϵ₁ * αᴺ + ϵ₂ * αᴺ^2 + ϵ₃ * αᴺ^3
+    den = ϵ₄ + ϵ₅ * αᴺ + ϵ₆ * αᴺ^2
+
+    return num / den
+end
+
+# Closures whose stability functions are `VariableStabilityFunctions`.
+const VariableStabilityTDVD = TKEDissipationVerticalDiffusivity{<:Any, <:Any, <:VariableStabilityFunctions}
+
+"""
+Return the momentum stability function at `i, j, k`, evaluated with the stratification
+and shear numbers after clamping each to its admissible range.
+"""
+@inline function momentum_stability_functionᶜᶜᶠ(i, j, k, grid, closure::VariableStabilityTDVD, velocities, tracers, buoyancy)
+    raw_αᴺ = stratification_numberᶜᶜᶠ(i, j, k, grid, closure, tracers, buoyancy)
+    raw_αᴹ = shear_numberᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy)
+
+    # Limit the stratification number first: the shear-number ceiling depends on it.
+    αᴺ = clamp(raw_αᴺ,
+               minimum_stratification_number(closure),
+               maximum_stratification_number(closure))
+
+    αᴹ = clamp(raw_αᴹ,
+               minimum_shear_number(closure),
+               maximum_shear_number(closure, αᴺ))
+
+    return momentum_stability_function(closure, αᴺ, αᴹ)
+end
+
+"""
+Evaluate the momentum stability function for stratification number `αᴺ` and shear number `αᴹ`:
+a numerator linear in `αᴺ` and `αᴹ` divided by a denominator quadratic in them.
+"""
+@inline function momentum_stability_function(closure::VariableStabilityTDVD, αᴺ::Number, αᴹ::Number)
+    sf = closure.stability_functions
+
+    numerator = sf.Cu₀ + sf.Cu₁ * αᴺ + sf.Cu₂ * αᴹ
+
+    denominator = sf.Cd₀ + sf.Cd₁ * αᴺ + sf.Cd₂ * αᴹ + sf.Cd₃ * αᴺ * αᴹ + sf.Cd₄ * αᴺ^2 + sf.Cd₅ * αᴹ^2
+
+    return numerator / denominator
+end
+
+"""
+Return the tracer stability function at `i, j, k`, evaluated with the stratification
+and shear numbers after clamping each to its admissible range.
+"""
+@inline function tracer_stability_functionᶜᶜᶠ(i, j, k, grid, closure::VariableStabilityTDVD, velocities, tracers, buoyancy)
+    raw_αᴺ = stratification_numberᶜᶜᶠ(i, j, k, grid, closure, tracers, buoyancy)
+    raw_αᴹ = shear_numberᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy)
+
+    # Limit the stratification number first: the shear-number ceiling depends on it.
+    αᴺ = clamp(raw_αᴺ,
+               minimum_stratification_number(closure),
+               maximum_stratification_number(closure))
+
+    αᴹ = clamp(raw_αᴹ,
+               minimum_shear_number(closure),
+               maximum_shear_number(closure, αᴺ))
+
+    return tracer_stability_function(closure, αᴺ, αᴹ)
+end
+
+"""
+Evaluate the tracer stability function for stratification number `αᴺ` and shear number `αᴹ`:
+a numerator linear in `αᴺ` and `αᴹ` divided by a denominator quadratic in them.
+"""
+@inline function tracer_stability_function(closure::VariableStabilityTDVD, αᴺ::Number, αᴹ::Number)
+    sf = closure.stability_functions
+
+    numerator = sf.Cc₀ + sf.Cc₁ * αᴺ + sf.Cc₂ * αᴹ
+
+    denominator = sf.Cd₀ + sf.Cd₁ * αᴺ + sf.Cd₂ * αᴹ + sf.Cd₃ * αᴺ * αᴹ + sf.Cd₄ * αᴺ^2 + sf.Cd₅ * αᴹ^2
+
+    return numerator / denominator
+end
+
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_dissipation_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_dissipation_vertical_diffusivity.jl
new file mode 100644
index 0000000000..d94035e507
--- /dev/null
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_dissipation_vertical_diffusivity.jl
@@ -0,0 +1,352 @@
+# Vertical-mixing closure with two prognostic tracers, `e` (TKE) and `ϵ` (dissipation).
+# `TD` is the time discretization; `FT` is the float type shared by the scalar parameters.
+struct TKEDissipationVerticalDiffusivity{TD, KE, ST, LMIN, FT, DT} <: AbstractScalarDiffusivity{TD, VerticalFormulation, 2}
+    tke_dissipation_equations :: KE  # parameters of the e and ϵ equations
+    stability_functions :: ST        # coefficients used by the *_stability_function methods
+    minimum_length_scale :: LMIN     # supplies `Cᴺ` and `minimum_buoyancy_frequency` to `minimum_dissipation`
+    maximum_tracer_diffusivity :: FT       # cap applied to κc
+    maximum_tke_diffusivity :: FT          # cap applied to κe
+    maximum_dissipation_diffusivity :: FT  # cap applied to κϵ
+    maximum_viscosity :: FT                # cap applied to κu
+    minimum_tke :: FT                      # floor applied to e when computing diffusivities
+    minimum_stratification_number_safety_factor :: FT # multiplies the minimum stratification number
+    negative_tke_damping_time_scale :: FT  # not used in this file -- presumably consumed by the e/ϵ time stepping; TODO confirm
+    tke_dissipation_time_step :: DT        # not used in this file -- presumably consumed by the e/ϵ time stepping; TODO confirm
+end
+
+# Constructor that takes only the time discretization `TD` explicitly and infers the remaining
+# type parameters from the arguments. All seven scalar parameters must share one type `FT`.
+function TKEDissipationVerticalDiffusivity{TD}(tke_dissipation_equations::KE,
+                                               stability_functions::ST,
+                                               minimum_length_scale :: LMIN,
+                                               maximum_tracer_diffusivity::FT,
+                                               maximum_tke_diffusivity::FT,
+                                               maximum_dissipation_diffusivity::FT,
+                                               maximum_viscosity::FT,
+                                               minimum_tke::FT,
+                                               minimum_stratification_number_safety_factor::FT,
+                                               negative_tke_damping_time_scale::FT, 
+                                               tke_dissipation_time_step::DT) where {TD, KE, ST, LMIN, FT, DT}
+
+    FullyTypedClosure = TKEDissipationVerticalDiffusivity{TD, KE, ST, LMIN, FT, DT}
+
+    return FullyTypedClosure(tke_dissipation_equations,
+                             stability_functions,
+                             minimum_length_scale,
+                             maximum_tracer_diffusivity,
+                             maximum_tke_diffusivity,
+                             maximum_dissipation_diffusivity,
+                             maximum_viscosity,
+                             minimum_tke,
+                             minimum_stratification_number_safety_factor,
+                             negative_tke_damping_time_scale,
+                             tke_dissipation_time_step)
+end
+
+# Convenience method: give only the float type and use the vertically-implicit time discretization.
+function TKEDissipationVerticalDiffusivity(FT::DataType; kw...)
+    time_discretization = VerticallyImplicitTimeDiscretization()
+    return TKEDissipationVerticalDiffusivity(time_discretization, FT; kw...)
+end
+
+# Short alias for the closure type, parameterized by time discretization only.
+const TDVD{TD} = TKEDissipationVerticalDiffusivity{TD} where TD
+# An array of such closures (see the "ensembles" note in `compute_TKEDissipation_diffusivities!`).
+const TDVDArray{TD} = AbstractArray{<:TDVD{TD}} where TD
+# Either a single closure or an array of closures; used for dispatch throughout this file.
+const FlavorOfTD{TD} = Union{TDVD{TD}, TDVDArray{TD}} where TD
+
+# The element type of a closure is its float type `FT` (the fifth type parameter).
+@inline function Base.eltype(::TKEDissipationVerticalDiffusivity{<:Any, <:Any, <:Any, <:Any, FT}) where FT
+    return FT
+end
+
+"""
+    TKEDissipationVerticalDiffusivity([time_discretization = VerticallyImplicitTimeDiscretization(),
+                                      FT = Float64;]
+                                      tke_dissipation_equations = TKEDissipationEquations(),
+                                      stability_functions = VariableStabilityFunctions(),
+                                      minimum_length_scale = StratifiedDisplacementScale(),
+                                      maximum_tracer_diffusivity = Inf,
+                                      maximum_tke_diffusivity = Inf,
+                                      maximum_dissipation_diffusivity = Inf,
+                                      maximum_viscosity = Inf,
+                                      minimum_tke = 1e-6,
+                                      minimum_stratification_number_safety_factor = 0.73,
+                                      negative_tke_damping_time_scale = 1minute,
+                                      tke_dissipation_time_step = nothing)
+
+Return the `TKEDissipationVerticalDiffusivity` turbulence closure for vertical mixing by
+microscale ocean turbulence based on the prognostic evolution of two variables: the 
+turbulent kinetic energy (TKE), and the turbulent kinetic energy dissipation.
+Elsewhere this is referred to as "k-ϵ". For more information about k-ϵ, see
+Burchard and Bolding (2001), Umlauf and Burchard (2003), and Umlauf and Burchard (2005).
+
+Arguments
+=========
+
+- `time_discretization`: Either `ExplicitTimeDiscretization()` or `VerticallyImplicitTimeDiscretization()`;
+                         default `VerticallyImplicitTimeDiscretization()`.
+
+- `FT`: Float type; default `Float64`.
+
+
+Keyword arguments
+=================
+
+- `tke_dissipation_equations`: Parameters of the TKE and dissipation equations.
+                               Default: `TKEDissipationEquations()`.
+
+- `stability_functions`: Stability functions for momentum, tracers, TKE, and dissipation;
+                         converted to `FT`. Default: `VariableStabilityFunctions()`.
+
+- `minimum_length_scale`: Parameters of the length scale used to bound the dissipation from below.
+                          Default: `StratifiedDisplacementScale()`.
+
+- `maximum_tracer_diffusivity`, `maximum_tke_diffusivity`, `maximum_dissipation_diffusivity`,
+  `maximum_viscosity`: Maximum values for the tracer, TKE, and dissipation diffusivities and
+                       for the viscosity. Used to clip the diffusivities when/if
+                       `TKEDissipationVerticalDiffusivity` predicts values that are too large.
+                       Default: `Inf`.
+
+- `minimum_tke`: Minimum value for the turbulent kinetic energy.
+                 Can be used to model the presence of "background" TKE
+                 levels due to, for example, mixing by breaking internal waves.
+                 Default: 1e-6.
+
+- `minimum_stratification_number_safety_factor`: Factor that multiplies the minimum stratification
+                                                 number used to limit the stability functions.
+                                                 Default: 0.73.
+
+- `negative_tke_damping_time_scale`: Damping time-scale for spurious negative values of TKE,
+                                     typically generated by oscillatory errors associated
+                                     with TKE advection.
+                                     Default: 1 minute.
+
+- `tke_dissipation_time_step`: Stored on the closure unchanged; presumably the time step used to
+                               advance the TKE and dissipation equations (to be confirmed against
+                               the time-stepping code). Default: `nothing`.
+
+Note that for numerical stability, it is recommended to either have a relatively short
+`negative_tke_damping_time_scale` or a reasonable `minimum_tke`, or both.
+"""
+function TKEDissipationVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTimeDiscretization(),
+                                           FT = Float64;
+                                           tke_dissipation_equations = TKEDissipationEquations(),
+                                           stability_functions = VariableStabilityFunctions(),
+                                           minimum_length_scale = StratifiedDisplacementScale(),
+                                           maximum_tracer_diffusivity = Inf,
+                                           maximum_tke_diffusivity = Inf,
+                                           maximum_dissipation_diffusivity = Inf,
+                                           maximum_viscosity = Inf,
+                                           minimum_tke = 1e-6,
+                                           minimum_stratification_number_safety_factor = 0.73,
+                                           negative_tke_damping_time_scale = 1minute,
+                                           tke_dissipation_time_step = nothing) where TD
+
+    stability_functions = convert_eltype(FT, stability_functions)
+
+    return TKEDissipationVerticalDiffusivity{TD}(tke_dissipation_equations,
+                                                 stability_functions,
+                                                 minimum_length_scale,
+                                                 convert(FT, maximum_tracer_diffusivity),
+                                                 convert(FT, maximum_tke_diffusivity),
+                                                 convert(FT, maximum_dissipation_diffusivity),
+                                                 convert(FT, maximum_viscosity),
+                                                 convert(FT, minimum_tke),
+                                                 convert(FT, minimum_stratification_number_safety_factor),
+                                                 convert(FT, negative_tke_damping_time_scale),
+                                                 tke_dissipation_time_step)
+end
+
+# Validate that both prognostic tracers required by this closure are present.
+# Returns the closure unchanged; throws an `ArgumentError` otherwise.
+function with_tracers(tracer_names, closure::FlavorOfTD)
+    has_tke = :e ∈ tracer_names
+    has_dissipation = :ϵ ∈ tracer_names
+
+    if !(has_tke && has_dissipation)
+        msg = "Tracers must contain :e and :ϵ to represent turbulent kinetic energy " *
+              "for `TKEDissipationVerticalDiffusivity`."
+        throw(ArgumentError(msg))
+    end
+
+    return closure
+end
+
+#####
+##### Stratified displacement length scale limiter
+#####
+
+# Parameters of the length scale `Cᴺ * sqrt(e / N²)` that `minimum_dissipation` uses to bound ϵ from below.
+Base.@kwdef struct StratifiedDisplacementScale{FT}
+    Cᴺ :: FT = 0.75
+    # NOTE(review): despite the name, `minimum_dissipation` uses this as a floor on `∂z_b`,
+    # i.e. on the *squared* buoyancy frequency -- confirm the intended units.
+    minimum_buoyancy_frequency :: FT = 1e-14
+end
+
+#####
+##### Diffusivities and diffusivity fields utilities
+#####
+
+# Allocate the auxiliary fields for this closure: the four diffusivities on z-faces, the implicit
+# linear coefficients `Le` and `Lϵ` at cell centers, a copy of the previous horizontal velocities,
+# and two per-tracer NamedTuples keyed by tracer name.
+function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfTD)
+
+    default_diffusivity_bcs = (κu = FieldBoundaryConditions(grid, (Center, Center, Face)),
+                               κc = FieldBoundaryConditions(grid, (Center, Center, Face)),
+                               κe = FieldBoundaryConditions(grid, (Center, Center, Face)),
+                               κϵ = FieldBoundaryConditions(grid, (Center, Center, Face)))
+
+    # User-supplied boundary conditions take precedence over the defaults.
+    bcs = merge(default_diffusivity_bcs, bcs)
+
+    κu = ZFaceField(grid, boundary_conditions=bcs.κu)
+    κc = ZFaceField(grid, boundary_conditions=bcs.κc)
+    κe = ZFaceField(grid, boundary_conditions=bcs.κe)
+    κϵ = ZFaceField(grid, boundary_conditions=bcs.κϵ)
+    Le = CenterField(grid)
+    Lϵ = CenterField(grid)
+
+    # Note: we may be able to avoid using the "previous velocities" in favor of a "fully implicit"
+    # discretization of shear production
+    u⁻ = XFaceField(grid)
+    v⁻ = YFaceField(grid)
+    previous_velocities = (; u=u⁻, v=v⁻)
+
+    # Secret tuple for getting tracer diffusivities with tuple[tracer_index]
+    # Every tracer maps to κc, except :e and :ϵ, which map to κe and κϵ. The NamedTuple keeps
+    # the order of `tracer_names`.
+    _tupled_tracer_diffusivities = Dict{Symbol, Any}(name => κc for name in tracer_names)
+    _tupled_tracer_diffusivities[:e] = κe
+    _tupled_tracer_diffusivities[:ϵ] = κϵ
+    _tupled_tracer_diffusivities = NamedTuple(name => _tupled_tracer_diffusivities[name]
+                                              for name in tracer_names)
+
+    # Same layout for the implicit linear coefficients: `ZeroField()` for every tracer except
+    # :e and :ϵ, which use Le and Lϵ.
+    _tupled_implicit_linear_coefficients = Dict{Symbol, Any}(name => ZeroField() for name in tracer_names)
+    _tupled_implicit_linear_coefficients[:e] = Le
+    _tupled_implicit_linear_coefficients[:ϵ] = Lϵ
+    _tupled_implicit_linear_coefficients = NamedTuple(name => _tupled_implicit_linear_coefficients[name]
+                                                      for name in tracer_names)
+
+    return (; κu, κc, κe, κϵ, Le, Lϵ, previous_velocities,
+            _tupled_tracer_diffusivities, _tupled_implicit_linear_coefficients)
+end        
+
+# Both the viscosity and the diffusivities are located at (c, c, f).
+@inline function viscosity_location(::FlavorOfTD)
+    return (c, c, f)
+end
+
+@inline function diffusivity_location(::FlavorOfTD)
+    return (c, c, f)
+end
+
+# Advance the TKE and dissipation equations (once a valid time step has been taken), store the
+# current horizontal velocities as the "previous velocities", and launch the kernel that fills
+# the diffusivity fields. Returns `nothing`.
+function compute_diffusivities!(diffusivities, closure::FlavorOfTD, model; parameters = :xyz)
+
+    arch = model.architecture
+    grid = model.grid
+    velocities = model.velocities
+    tracers = model.tracers
+    buoyancy = model.buoyancy
+
+    if isfinite(model.clock.last_Δt) # Check that we have taken a valid time-step first.
+        # Compute e and ϵ at the current time
+        time_step_tke_dissipation_equations!(model)
+    end
+
+    # Update "previous velocities"
+    u⁻, v⁻ = diffusivities.previous_velocities
+    parent(u⁻) .= parent(velocities.u)
+    parent(v⁻) .= parent(velocities.v)
+
+    launch!(arch, grid, parameters,
+            compute_TKEDissipation_diffusivities!,
+            diffusivities, grid, closure, velocities, tracers, buoyancy)
+
+    return nothing
+end
+
+# Kernel: at each index compute the four diffusivities (κu, κc, κe, κϵ), pass them through
+# `mask_diffusivity`, and write them into `diffusivities`.
+@kernel function compute_TKEDissipation_diffusivities!(diffusivities, grid, closure::FlavorOfTD,
+                                                       velocities, tracers, buoyancy)
+    i, j, k = @index(Global, NTuple)
+
+    # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
+    closure_ij = getclosure(i, j, closure)
+
+    # Note: we also compute the TKE and dissipation diffusivities here for diagnostic purposes.
+    # NOTE(review): the original comment says they are recomputed in
+    # `time_step_turbulent_kinetic_energy`; this closure calls
+    # `time_step_tke_dissipation_equations!` -- confirm which routine is meant.
+    κu★ = κuᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy)
+    κc★ = κcᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy)
+    κe★ = κeᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy)
+    κϵ★ = κϵᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy)
+
+    κu★ = mask_diffusivity(i, j, k, grid, κu★)
+    κc★ = mask_diffusivity(i, j, k, grid, κc★)
+    κe★ = mask_diffusivity(i, j, k, grid, κe★)
+    κϵ★ = mask_diffusivity(i, j, k, grid, κϵ★)
+
+    @inbounds begin
+        diffusivities.κu[i, j, k] = κu★
+        diffusivities.κc[i, j, k] = κc★
+        diffusivities.κe[i, j, k] = κe★
+        diffusivities.κϵ[i, j, k] = κϵ★
+    end
+end
+
+"""
+Return the TKE tracer `e` at `i, j, k`, bounded below by `closure.minimum_tke`.
+"""
+@inline function turbulent_kinetic_energyᶜᶜᶜ(i, j, k, grid, closure, tracers)
+    @inbounds local_tke = tracers.e[i, j, k]
+    return max(closure.minimum_tke, local_tke)
+end
+
+# Return the larger of the number `a` and the function `b` evaluated at `i, j, k` on `grid`.
+@inline function max_a_b(i, j, k, grid, a::Number, b, args...)
+    b_value = b(i, j, k, grid, args...)
+    return max(a, b_value)
+end
+
+# Upper bound on the dissipation: infinity in the grid's float type, i.e. no upper limit.
+@inline function maximum_dissipation(i, j, k, grid, closure, tracers, buoyancy)
+    FT = eltype(grid)
+    return convert(FT, Inf)
+end
+
+"""
+Return the lower bound on the dissipation at `i, j, k`.
+
+The bound is `𝕊u₀³ e^(3/2) / ℓ`, where `ℓ` is the smaller of `grid.Lz` and the length scale
+`Cᴺ * sqrt(e / N²)`, with `N²` floored at `minimum_buoyancy_frequency`. The result is never
+smaller than `1e-12`.
+"""
+@inline function minimum_dissipation(i, j, k, grid, closure, tracers, buoyancy)
+    length_scale_parameters = closure.minimum_length_scale
+
+    # Floored buoyancy gradient
+    floor_N² = length_scale_parameters.minimum_buoyancy_frequency
+    limited_N² = ℑbzᵃᵃᶜ(i, j, k, grid, max_a_b, floor_N², ∂z_b, buoyancy, tracers)
+
+    # Length scale, capped by the domain depth
+    tke = turbulent_kinetic_energyᶜᶜᶜ(i, j, k, grid, closure, tracers)
+    stratified_length = length_scale_parameters.Cᴺ * sqrt(tke / limited_N²)
+    capped_length = min(grid.Lz, stratified_length)
+
+    length_based_floor = closure.stability_functions.𝕊u₀^3 * sqrt(tke)^3 / capped_length
+    absolute_floor = convert(eltype(grid), 1e-12)
+
+    return max(absolute_floor, length_based_floor)
+end
+
+"""
+Return the dissipation tracer `ϵ` at `i, j, k`, clamped between `minimum_dissipation`
+and `maximum_dissipation`.
+"""
+@inline function dissipationᶜᶜᶜ(i, j, k, grid, closure, tracers, buoyancy)
+    lower = minimum_dissipation(i, j, k, grid, closure, tracers, buoyancy)
+    upper = maximum_dissipation(i, j, k, grid, closure, tracers, buoyancy)
+    @inbounds local_ϵ = tracers.ϵ[i, j, k]
+    return clamp(local_ϵ, lower, upper)
+end
+
+"""
+Return the viscosity `𝕊u e² / ϵ`, capped at `closure.maximum_viscosity`.
+"""
+@inline function κuᶜᶜᶠ(i, j, k, grid, closure::TDVD, velocities, tracers, buoyancy)
+    stability = momentum_stability_functionᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy)
+    tke_squared = ℑzᵃᵃᶠ(i, j, k, grid, ϕ², turbulent_kinetic_energyᶜᶜᶜ, closure, tracers)
+    dissipation = ℑzᵃᵃᶠ(i, j, k, grid, dissipationᶜᶜᶜ, closure, tracers, buoyancy)
+    return min(stability * tke_squared / dissipation, closure.maximum_viscosity)
+end
+
+"""
+Return the tracer diffusivity `𝕊c e² / ϵ`, capped at `closure.maximum_tracer_diffusivity`.
+"""
+@inline function κcᶜᶜᶠ(i, j, k, grid, closure::TDVD, velocities, tracers, buoyancy)
+    stability = tracer_stability_functionᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy)
+    tke_squared = ℑzᵃᵃᶠ(i, j, k, grid, ϕ², turbulent_kinetic_energyᶜᶜᶜ, closure, tracers)
+    dissipation = ℑzᵃᵃᶠ(i, j, k, grid, dissipationᶜᶜᶜ, closure, tracers, buoyancy)
+    return min(stability * tke_squared / dissipation, closure.maximum_tracer_diffusivity)
+end
+
+"""
+Return the TKE diffusivity `𝕊e e² / ϵ`, capped at `closure.maximum_tke_diffusivity`.
+"""
+@inline function κeᶜᶜᶠ(i, j, k, grid, closure::TDVD, velocities, tracers, buoyancy)
+    stability = tke_stability_functionᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy)
+    tke_squared = ℑzᵃᵃᶠ(i, j, k, grid, ϕ², turbulent_kinetic_energyᶜᶜᶜ, closure, tracers)
+    dissipation = ℑzᵃᵃᶠ(i, j, k, grid, dissipationᶜᶜᶜ, closure, tracers, buoyancy)
+    return min(stability * tke_squared / dissipation, closure.maximum_tke_diffusivity)
+end
+
+"""
+Return the dissipation diffusivity `𝕊ϵ e² / ϵ`, capped at `closure.maximum_dissipation_diffusivity`.
+"""
+@inline function κϵᶜᶜᶠ(i, j, k, grid, closure::TDVD, velocities, tracers, buoyancy)
+    stability = dissipation_stability_functionᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy)
+    tke_squared = ℑzᵃᵃᶠ(i, j, k, grid, ϕ², turbulent_kinetic_energyᶜᶜᶜ, closure, tracers)
+    dissipation = ℑzᵃᵃᶠ(i, j, k, grid, dissipationᶜᶜᶜ, closure, tracers, buoyancy)
+    return min(stability * tke_squared / dissipation, closure.maximum_dissipation_diffusivity)
+end
+
+# The viscosity is the stored κu field.
+@inline function viscosity(::FlavorOfTD, diffusivities)
+    return diffusivities.κu
+end
+
+# Tracer diffusivities are looked up by tracer index in the per-tracer NamedTuple.
+@inline function diffusivity(::FlavorOfTD, diffusivities, ::Val{id}) where id
+    per_tracer = diffusivities._tupled_tracer_diffusivities
+    return per_tracer[id]
+end
+    
+#####
+##### Show
+#####
+
+# One-line description: the closure name with the name of its time discretization type.
+function Base.summary(closure::TDVD)
+    discretization_name = nameof(typeof(time_discretization(closure)))
+    return string("TKEDissipationVerticalDiffusivity{", discretization_name, "}")
+end
+
+# Multi-line display of the closure: the summary line, the scalar parameters, the coefficients
+# of the TKE/dissipation equations, and finally the stability functions.
+function Base.show(io::IO, clo::TDVD)
+    print(io, summary(clo))
+    print(io, '\n')
+    print(io, "├── maximum_tracer_diffusivity: ", prettysummary(clo.maximum_tracer_diffusivity), '\n',
+              "├── maximum_tke_diffusivity: ", prettysummary(clo.maximum_tke_diffusivity), '\n',
+              "├── maximum_dissipation_diffusivity: ", prettysummary(clo.maximum_dissipation_diffusivity), '\n',
+              "├── maximum_viscosity: ", prettysummary(clo.maximum_viscosity), '\n',
+              "├── minimum_tke: ", prettysummary(clo.minimum_tke), '\n',
+              "├── negative_tke_damping_time_scale: ", prettysummary(clo.negative_tke_damping_time_scale), '\n',
+              "├── tke_dissipation_time_step: ", prettysummary(clo.tke_dissipation_time_step), '\n',
+              "├── tke_dissipation_equations: ", prettysummary(clo.tke_dissipation_equations), '\n',
+              "│   ├── Cᵋϵ: ", prettysummary(clo.tke_dissipation_equations.Cᵋϵ),  '\n',
+              "│   ├── Cᴾϵ: ", prettysummary(clo.tke_dissipation_equations.Cᴾϵ),  '\n',
+              "│   ├── Cᵇϵ: ", prettysummary(clo.tke_dissipation_equations.Cᵇϵ),  '\n',
+              "│   ├── Cᵂu★: ", prettysummary(clo.tke_dissipation_equations.Cᵂu★), '\n',
+              "│   └── CᵂwΔ: ", prettysummary(clo.tke_dissipation_equations.CᵂwΔ), '\n')
+    # The two trailing strings are the `prefix` and `sep` arguments of
+    # `summarize_stability_functions`; they must not be passed to `print`, which would
+    # write them out as stray trailing whitespace.
+    print(io, "└── ", summarize_stability_functions(clo.stability_functions, "", "    "))
+end
+
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_top_boundary_condition.jl b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_top_boundary_condition.jl
new file mode 100644
index 0000000000..f67db36c3e
--- /dev/null
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/TKEBasedVerticalDiffusivities/tke_top_boundary_condition.jl
@@ -0,0 +1,91 @@
+using Oceananigans.BoundaryConditions: DiscreteBoundaryFunction, BoundaryCondition, Flux
+
+# Parameters carried by the top TKE flux boundary condition: the top boundary conditions of the
+# tracers and of the velocities.
+struct TKETopBoundaryConditionParameters{C, U}
+    top_tracer_boundary_conditions :: C
+    top_velocity_boundary_conditions :: U
+end
+
+# A discrete boundary function whose parameters are `TKETopBoundaryConditionParameters`...
+const TKEBoundaryFunction = DiscreteBoundaryFunction{<:TKETopBoundaryConditionParameters}
+# ...and a flux boundary condition built on such a function. Used to dispatch `getbc` below.
+const TKEBoundaryCondition = BoundaryCondition{<:Flux, <:TKEBoundaryFunction}
+
+# Adapt both stored sets of boundary conditions and rebuild the parameters.
+@inline function Adapt.adapt_structure(to, p::TKETopBoundaryConditionParameters)
+    tracer_bcs = adapt(to, p.top_tracer_boundary_conditions)
+    velocity_bcs = adapt(to, p.top_velocity_boundary_conditions)
+    return TKETopBoundaryConditionParameters(tracer_bcs, velocity_bcs)
+end
+
+# Move both stored sets of boundary conditions to architecture `to` and rebuild the parameters.
+@inline function on_architecture(to, p::TKETopBoundaryConditionParameters)
+    tracer_bcs = on_architecture(to, p.top_tracer_boundary_conditions)
+    velocity_bcs = on_architecture(to, p.top_velocity_boundary_conditions)
+    return TKETopBoundaryConditionParameters(tracer_bcs, velocity_bcs)
+end
+
+# Two-index `getbc`: call the boundary function with its parameters followed by the closure and
+# buoyancy.
+@inline function getbc(bc::TKEBoundaryCondition, i::Integer, j::Integer, grid::AbstractGrid, clock, fields, clo, buoyancy)
+    condition = bc.condition
+    return condition.func(i, j, grid, clock, fields, condition.parameters, clo, buoyancy)
+end
+
+# Three-index `getbc`: same as the two-index method, with `k` forwarded to the boundary function.
+@inline function getbc(bc::TKEBoundaryCondition, i::Integer, j::Integer, k::Integer, grid::AbstractGrid, clock, fields, clo, buoyancy)
+    condition = bc.condition
+    return condition.func(i, j, k, grid, clock, fields, condition.parameters, clo, buoyancy)
+end
+
+"""
+    top_tke_flux(i, j, grid, clock, fields, parameters, closure, buoyancy)
+
+Compute the flux of TKE through the surface / top boundary.
+This generic fallback returns `zero(grid)`, that is, no flux.
+
+Designed to be used with `TKETopBoundaryConditionParameters` in a `FluxBoundaryCondition`, eg:
+
+```
+top_tracer_bcs = top_tracer_boundary_conditions(grid, tracer_names, user_bcs)
+top_velocity_bcs = top_velocity_boundary_conditions(grid, user_bcs)
+parameters = TKETopBoundaryConditionParameters(top_tracer_bcs, top_velocity_bcs)
+top_tke_bc = FluxBoundaryCondition(top_tke_flux, discrete_form=true, parameters=parameters)
+```
+
+See the implementation in catke_equation.jl.
+"""
+@inline function top_tke_flux(i, j, grid, clock, fields, parameters, closure, buoyancy)
+    return zero(grid)
+end
+
+#####
+##### For model constructors
+#####
+
+"""
+Return a NamedTuple with the top boundary condition of every tracer in `tracer_names`,
+taken from `user_bcs` where provided and from default boundary conditions otherwise.
+"""
+function top_tracer_boundary_conditions(grid, tracer_names, user_bcs)
+    center = (Center, Center, Center)
+    defaults = NamedTuple(name => FieldBoundaryConditions(grid, center) for name in tracer_names)
+    merged = merge(defaults, user_bcs)
+    return NamedTuple(name => merged[name].top for name in tracer_names)
+end
+
+"""
+Return the top boundary conditions for `u` and `v` as `(u=..., v=...)`, taken from `user_bcs`
+where provided and from the default prognostic boundary condition otherwise.
+"""
+function top_velocity_boundary_conditions(grid, user_bcs)
+    vertical_topology = topology(grid, 3)()
+    fallback = default_prognostic_bc(vertical_topology, Center(), DefaultBoundaryCondition())
+
+    provided = keys(user_bcs)
+
+    if :u ∈ provided
+        u_top_bc = user_bcs.u.top
+    else
+        u_top_bc = fallback
+    end
+
+    if :v ∈ provided
+        v_top_bc = user_bcs.v.top
+    else
+        v_top_bc = fallback
+    end
+
+    return (u=u_top_bc, v=v_top_bc)
+end
+
+"""
+Computes the friction velocity u★ based on fluxes of u and v:
+the square root of the magnitude of the flux vector `(τx, τy)`.
+"""
+@inline function friction_velocity(i, j, grid, clock, fields, velocity_bcs)
+    τx = getbc(velocity_bcs.u, i, j, grid, clock, fields)
+    τy = getbc(velocity_bcs.v, i, j, grid, clock, fields)
+    return sqrt(sqrt(τx^2 + τy^2))
+end
+
+"""
+Computes the cube of the convective velocity, w★³: the clipped top buoyancy flux
+times the thickness of the top cell (`k = grid.Nz`).
+"""
+@inline function top_convective_turbulent_velocity_cubed(i, j, grid, clock, fields, buoyancy, tracer_bcs)
+    top_flux = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, fields)
+    top_cell_thickness = Δzᶜᶜᶜ(i, j, grid.Nz, grid)
+    return clip(top_flux) * top_cell_thickness
+end
+
+# Tuples of closures: the top TKE flux is the sum of the fluxes of the individual closures.
+# Explicit methods are provided for tuples of length one, two, and three.
+@inline top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple::Tuple{<:Any}, buoyancy) =
+    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[1], buoyancy)
+
+@inline top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple::Tuple{<:Any, <:Any}, buoyancy) =
+    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[1], buoyancy) + 
+    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[2], buoyancy)
+
+@inline top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple::Tuple{<:Any, <:Any, <:Any}, buoyancy) =
+    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[1], buoyancy) + 
+    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[2], buoyancy) + 
+    top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[3], buoyancy)
+
+
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index d06d7df1b1..2d4b7bb2eb 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -177,17 +177,17 @@ const f = Face()
 @inline viscosity_location(::FlavorOfRBVD)   = (c, c, f)
 @inline diffusivity_location(::FlavorOfRBVD) = (c, c, f)
 
-@inline viscosity(::FlavorOfRBVD, diffusivities) = diffusivities.κᵘ
-@inline diffusivity(::FlavorOfRBVD, diffusivities, id) = diffusivities.κᶜ
+@inline viscosity(::FlavorOfRBVD, diffusivities) = diffusivities.κu
+@inline diffusivity(::FlavorOfRBVD, diffusivities, id) = diffusivities.κc
 
 with_tracers(tracers, closure::FlavorOfRBVD) = closure
 
 # Note: computing diffusivities at cell centers for now.
 function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
-    κᶜ = Field((Center, Center, Face), grid)
-    κᵘ = Field((Center, Center, Face), grid)
+    κc = Field((Center, Center, Face), grid)
+    κu = Field((Center, Center, Face), grid)
     Ri = Field((Center, Center, Face), grid)
-    return (; κᶜ, κᵘ, Ri)
+    return (; κc, κu, Ri)
 end
 
 function compute_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; parameters = :xyz)
@@ -312,30 +312,30 @@ end
 
     # Shear mixing diffusivity and viscosity
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
-    κᶜ★ = κ₀ * τ
-    κᵘ★ = ν₀ * τ
+    κc★ = κ₀ * τ
+    κu★ = ν₀ * τ
 
     # Previous diffusivities
-    κᶜ = diffusivities.κᶜ
-    κᵘ = diffusivities.κᵘ
+    κc = diffusivities.κc
+    κu = diffusivities.κu
 
     # New diffusivities
-    κᶜ⁺ = κᶜᵃ + κᵉⁿ + κᶜ★
-    κᵘ⁺ = κᵘ★
+    κc⁺ = κᶜᵃ + κᵉⁿ + κc★
+    κu⁺ = κu★
 
     # Limit by specified maximum
-    κᶜ⁺ = min(κᶜ⁺, closure_ij.maximum_diffusivity) 
-    κᵘ⁺ = min(κᵘ⁺, closure_ij.maximum_viscosity) 
+    κc⁺ = min(κc⁺, closure_ij.maximum_diffusivity) 
+    κu⁺ = min(κu⁺, closure_ij.maximum_viscosity) 
 
     # Set to zero on periphery and NaN within inactive region
     on_periphery = peripheral_node(i, j, k, grid, c, c, f)
     within_inactive = inactive_node(i, j, k, grid, c, c, f)
-    κᶜ⁺ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᶜ⁺))
-    κᵘ⁺ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᵘ⁺))
+    κc⁺ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κc⁺))
+    κu⁺ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κu⁺))
 
     # Update by averaging in time
-    @inbounds κᶜ[i, j, k] = (Cᵃᵛ * κᶜ[i, j, k] + κᶜ⁺) / (1 + Cᵃᵛ)
-    @inbounds κᵘ[i, j, k] = (Cᵃᵛ * κᵘ[i, j, k] + κᵘ⁺) / (1 + Cᵃᵛ)
+    @inbounds κc[i, j, k] = (Cᵃᵛ * κc[i, j, k] + κc⁺) / (1 + Cᵃᵛ)
+    @inbounds κu[i, j, k] = (Cᵃᵛ * κu[i, j, k] + κu⁺) / (1 + Cᵃᵛ)
 
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
index 6b6faba472..d17f1eb72e 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
@@ -53,11 +53,15 @@ value of keyword argument `discrete_form`, the constructor expects:
   a `LatitudeLongitudeGrid`.
 
 * `discrete_form = true`:
-  - with `loc = (nothing, nothing, nothing)` (default):
-    functions of `(i, j, k, grid, ℓx, ℓy, ℓz)` with `ℓx`, `ℓy`,
+  - with `loc = (nothing, nothing, nothing)` and `parameters = nothing` (default):
+    functions of `(i, j, k, grid, ℓx, ℓy, ℓz, clock, fields)` with `ℓx`, `ℓy`,
     and `ℓz` either `Face()` or `Center()`.
   - with `loc = (ℓx, ℓy, ℓz)` with `ℓx`, `ℓy`, and `ℓz` either
-    `Face()` or `Center()`: functions of `(i, j, k, grid)`.
+    `Face()` or `Center()` and `parameters = nothing`: functions of `(i, j, k, grid, clock, fields)`.
+  - with `loc = (nothing, nothing, nothing)` and specified `parameters`:
+    functions of `(i, j, k, grid, ℓx, ℓy, ℓz, clock, fields, parameters)`.
+  - with `loc = (ℓx, ℓy, ℓz)` and specified `parameters`:
+    functions of `(i, j, k, grid, clock, fields, parameters)`.
 
 * `parameters`: `NamedTuple` with parameters used by the functions
   that compute viscosity and/or diffusivity; default: `nothing`.
@@ -85,7 +89,7 @@ ScalarDiffusivity{ExplicitTimeDiscretization}(ν=ν (generic function with 1 met
 ```jldoctest ScalarDiffusivity
 julia> using Oceananigans.Grids: znode
 
-julia> @inline function κ(i, j, k, grid, ℓx, ℓy, ℓz)
+julia> @inline function κ(i, j, k, grid, ℓx, ℓy, ℓz, clock, fields)
            z = znode(i, j, k, grid, ℓx, ℓy, ℓz)
            return 2000 * exp(z / depth_scale)
        end
@@ -96,8 +100,8 @@ ScalarDiffusivity{ExplicitTimeDiscretization}(ν=0.0, κ=Oceananigans.Turbulence
 ```
 
 ```jldoctest ScalarDiffusivity
-julia> @inline function another_κ(i, j, k, grid, p)
-           z = znode(i, j, k, grid)
+julia> @inline function another_κ(i, j, k, grid, clock, fields, p)
+           z = znode(i, j, k, grid, Center(), Center(), Face())
            return 2000 * exp(z / p.depth_scale)
        end
 another_κ (generic function with 1 method)
diff --git a/src/TurbulenceClosures/velocity_tracer_gradients.jl b/src/TurbulenceClosures/velocity_tracer_gradients.jl
index 78bedbdb81..be36e6996e 100644
--- a/src/TurbulenceClosures/velocity_tracer_gradients.jl
+++ b/src/TurbulenceClosures/velocity_tracer_gradients.jl
@@ -9,10 +9,10 @@
 
 # Off-diagonal
 @inline ∂x_v(i, j, k, grid, v) = ∂xᶠᶠᶜ(i, j, k, grid, v)
-@inline ∂x_w(i, j, k, grid, w) = ∂xᶠᶜᶜ(i, j, k, grid, w)
+@inline ∂x_w(i, j, k, grid, w) = ∂xᶠᶜᶠ(i, j, k, grid, w)
 
 @inline ∂y_u(i, j, k, grid, u) = ∂yᶠᶠᶜ(i, j, k, grid, u)
-@inline ∂y_w(i, j, k, grid, w) = ∂yᶜᶠᶜ(i, j, k, grid, w)
+@inline ∂y_w(i, j, k, grid, w) = ∂yᶜᶠᶠ(i, j, k, grid, w)
 
 @inline ∂z_u(i, j, k, grid, u) = ∂zᶠᶜᶠ(i, j, k, grid, u)
 @inline ∂z_v(i, j, k, grid, v) = ∂zᶜᶠᶠ(i, j, k, grid, v)
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 5c6ccf506c..361fa11ea6 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -30,13 +30,51 @@ kp = KernelParameters(size, offsets)
 
 # Launch a kernel with indices that range from i=1:8, j=2:7, k=3:6,
 # where i, j, k are the first, second, and third index, respectively:
+
 launch!(arch, grid, kp, kernel!; kernel_args...)
 ```
 
-See the documentation for [`launch!`](@ref).
+See [`launch!`](@ref).
 """
 KernelParameters(size, offsets) = KernelParameters{size, offsets}()
 
+"""
+    KernelParameters(range1, [range2, range3])
+
+Construct kernel-launch parameters for a kernel of one, two, or three dimensions
+whose indices in each dimension run over `range1`, `range2`, and `range3`, respectively.
+
+Example
+=======
+
+```julia
+kp = KernelParameters(1:4, 0:10)
+
+# Launch a kernel whose first index covers i=1:4 and
+# whose second index covers j=0:10:
+launch!(arch, grid, kp, kernel!; kernel_args...)
+```
+
+See [`launch!`](@ref).
+"""
+function KernelParameters(r::UnitRange)
+    extent = (length(r),)   # 1-tuple: number of indices spanned
+    shift = (first(r) - 1,) # 1-tuple: offset such that index 1 maps to first(r)
+    return KernelParameters(extent, shift)
+end
+
+function KernelParameters(r1::UnitRange, r2::UnitRange)
+    ranges = (r1, r2)
+    shifts = map(r -> first(r) - 1, ranges)
+    return KernelParameters(map(length, ranges), shifts)
+end
+
+function KernelParameters(r1::UnitRange, r2::UnitRange, r3::UnitRange)
+    ranges = (r1, r2, r3)
+    shifts = map(r -> first(r) - 1, ranges)
+    return KernelParameters(map(length, ranges), shifts)
+end
+
 offsets(::KernelParameters{S, O}) where {S, O} = O
 offsets(workspec)  = nothing
 
@@ -133,7 +171,6 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
                               active_cells_map,
                               kwargs...)
     
-    
     !isnothing(loop!) && loop!(kernel_args...)
     
     return nothing
@@ -180,7 +217,7 @@ function configured_kernel(arch, grid, workspec, kernel!;
     offset = offsets(workspec)
 
     if !isnothing(active_cells_map) 
-        workgroup, worksize = active_cells_work_layout(workgroup, worksize, active_cells_map, grid) 
+        workgroup, worksize = active_cells_work_layout(workgroup, worksize, active_cells_map) 
         offset = nothing
 
         # A non active domain! 
diff --git a/src/Utils/schedules.jl b/src/Utils/schedules.jl
index f13749bc59..49302cfe7b 100644
--- a/src/Utils/schedules.jl
+++ b/src/Utils/schedules.jl
@@ -18,8 +18,8 @@ schedule_aligned_time_step(schedule, clock, Δt) = Δt
 function initialize!(schedule::AbstractSchedule, model)
     schedule(model)
 
-    # `return true` indicates that the schedule
-    # "actuates" at initial call.
+    # Returning `true` signals that the schedule actuates at the initial call;
+    # this is the default behavior for schedules.
     return true
 end
 
@@ -47,17 +47,19 @@ on a `interval` of simulation time, as kept by `model.clock`.
 """
 TimeInterval(interval) = TimeInterval(convert(Float64, interval), 0.0, 0)
 
-function initialize!(schedule::TimeInterval, model)
-    schedule.first_actuation_time = model.clock.time
-    schedule(model)
+function initialize!(schedule::TimeInterval, first_actuation_time::Number) # (re)start `schedule` from a given time
+    schedule.first_actuation_time = first_actuation_time
+    schedule.actuations = 0 # reset the count; `next_actuation_time` then yields first_actuation_time + interval
     return true
 end
 
+initialize!(schedule::TimeInterval, model) = initialize!(schedule, model.clock.time) # uses model.clock.time as the start time
+
 function next_actuation_time(schedule::TimeInterval)
     t₀ = schedule.first_actuation_time
     N = schedule.actuations
     T = schedule.interval
-    return t₀ + N * T
+    return t₀ + (N + 1) * T # with N actuations counted so far, the next one falls (N + 1) intervals after t₀
 end
 
 function (schedule::TimeInterval)(model)
@@ -67,9 +69,8 @@ function (schedule::TimeInterval)(model)
     if t >= t★
         if schedule.actuations < typemax(Int)
             schedule.actuations += 1
-        else
-            schedule.first_actuation_time = t★
-            schedule.actuations = 1
+        else # re-initialize the schedule to t★
+            initialize!(schedule, t★)
         end
         return true
     else
@@ -187,7 +188,10 @@ end
 
 initialize!(st::SpecifiedTimes, model) = st(model)
 
-align_time_step(schedule::SpecifiedTimes, clock, Δt) = min(Δt, next_actuation_time(schedule) - clock.time)
+function schedule_aligned_time_step(schedule::SpecifiedTimes, clock, Δt)
+    time_to_next_actuation = next_actuation_time(schedule) - clock.time # time left until the schedule's next actuation
+    return min(Δt, time_to_next_actuation) # never step past the next actuation
+end
 
 function specified_times_str(st)
     str_elems = ["$(prettytime(t)), " for t in st.times]
@@ -272,8 +276,9 @@ function (as::OrSchedule)(model)
     return any(actuations)
 end
 
-align_time_step(any_or_all_schedule::Union{OrSchedule, AndSchedule}, clock, Δt) =
-    minimum(align_time_step(clock, Δt, schedule) for schedule in any_or_all_schedule.schedules)
+schedule_aligned_time_step(any_or_all_schedule::Union{OrSchedule, AndSchedule}, clock, Δt) =
+    minimum(schedule_aligned_time_step(schedule, clock, Δt) # aligned step for each child schedule
+            for schedule in any_or_all_schedule.schedules)  # the smallest aligned step wins
 
 #####
 ##### Show methods
@@ -285,3 +290,4 @@ Base.summary(schedule::SpecifiedTimes) = string("SpecifiedTimes(", specified_tim
 Base.summary(schedule::ConsecutiveIterations) = string("ConsecutiveIterations(",
                                                        summary(schedule.parent), ", ",
                                                        schedule.consecutive_iterations, ")")
+
diff --git a/test/dependencies_for_poisson_solvers.jl b/test/dependencies_for_poisson_solvers.jl
index 24ff92678a..a4a404e034 100644
--- a/test/dependencies_for_poisson_solvers.jl
+++ b/test/dependencies_for_poisson_solvers.jl
@@ -17,15 +17,15 @@ function random_divergent_source_term(grid)
     v_bcs = regularize_field_boundary_conditions(default_bcs, grid, :v)
     w_bcs = regularize_field_boundary_conditions(default_bcs, grid, :w)
 
-    Ru = CenterField(grid, boundary_conditions=u_bcs)
-    Rv = CenterField(grid, boundary_conditions=v_bcs)
-    Rw = CenterField(grid, boundary_conditions=w_bcs)
+    Ru, Rv, Rw = VelocityFields(grid, (; u = u_bcs, v = v_bcs, w = w_bcs))
+
     U = (u=Ru, v=Rv, w=Rw)
 
     Nx, Ny, Nz = size(grid)
-    set!(Ru, rand(Nx, Ny, Nz))
-    set!(Rv, rand(Nx, Ny, Nz))
-    set!(Rw, rand(Nx, Ny, Nz))
+
+    set!(Ru, rand(size(Ru)...))
+    set!(Rv, rand(size(Rv)...))
+    set!(Rw, rand(size(Rw)...))
 
     fill_halo_regions!(Ru)
     fill_halo_regions!(Rv)
@@ -46,18 +46,20 @@ function random_divergence_free_source_term(grid)
     w_bcs = regularize_field_boundary_conditions(default_bcs, grid, :w)
 
     # Random right hand side
-    Ru = CenterField(grid, boundary_conditions=u_bcs)
-    Rv = CenterField(grid, boundary_conditions=v_bcs)
-    Rw = CenterField(grid, boundary_conditions=w_bcs)
+    Ru, Rv, Rw = VelocityFields(grid, (; u = u_bcs, v = v_bcs, w = w_bcs))
+
     U = (u=Ru, v=Rv, w=Rw)
 
     Nx, Ny, Nz = size(grid)
-    set!(Ru, rand(Nx, Ny, Nz))
-    set!(Rv, rand(Nx, Ny, Nz))
-    set!(Rw, zeros(Nx, Ny, Nz))
 
+    set!(Ru, rand(size(Ru)...))
+    set!(Rv, rand(size(Rv)...))
+    set!(Rw, rand(size(Rw)...))
+
+    fill_halo_regions!(Ru)
+    fill_halo_regions!(Rv)
+    
     arch = architecture(grid)
-    fill_halo_regions!((Ru, Rv, Rw))
 
     compute_w_from_continuity!(U, arch, grid)
     fill_halo_regions!(Rw)
diff --git a/test/dependencies_for_runtests.jl b/test/dependencies_for_runtests.jl
index 60147f1809..3e15e0a3d8 100644
--- a/test/dependencies_for_runtests.jl
+++ b/test/dependencies_for_runtests.jl
@@ -1,22 +1,28 @@
+using Oceananigans
 using Test
 using Printf
 using Random
 using Statistics
 using LinearAlgebra
 using Logging
-
+using Enzyme
+using SparseArrays
+using JLD2
+using FFTW
+using OffsetArrays
+using SeawaterPolynomials
 using CUDA
 using MPI
 
+using Dates: DateTime, Nanosecond
+using Statistics: mean, mean!, norm
+using LinearAlgebra: norm
+using NCDatasets: Dataset
+using KernelAbstractions: @kernel, @index
+
 MPI.versioninfo()
 MPI.Initialized() || MPI.Init()
 
-using JLD2
-using FFTW
-using OffsetArrays
-using SeawaterPolynomials
-
-using Oceananigans
 using Oceananigans.Architectures
 using Oceananigans.Grids
 using Oceananigans.Operators
@@ -29,6 +35,7 @@ using Oceananigans.BuoyancyModels
 using Oceananigans.Forcings
 using Oceananigans.Solvers
 using Oceananigans.Models
+using Oceananigans.MultiRegion
 using Oceananigans.Simulations
 using Oceananigans.Diagnostics
 using Oceananigans.OutputWriters
@@ -37,20 +44,23 @@ using Oceananigans.DistributedComputations
 using Oceananigans.Logger
 using Oceananigans.Units
 using Oceananigans.Utils
-using Oceananigans.MultiRegion
-using Oceananigans.Architectures: device, array_type # to resolve conflict with CUDA.device
-using Oceananigans.Architectures: on_architecture
 
 using Oceananigans: Clock
-using Dates: DateTime, Nanosecond
-using Statistics: mean
-using LinearAlgebra: norm
-using NCDatasets: Dataset
-using KernelAbstractions: @kernel, @index
+using Oceananigans.Architectures: device, array_type # to resolve conflict with CUDA.device
+using Oceananigans.Architectures: on_architecture
+using Oceananigans.AbstractOperations: UnaryOperation, Derivative, BinaryOperation, MultiaryOperation
+using Oceananigans.AbstractOperations: KernelFunctionOperation
+using Oceananigans.BuoyancyModels: BuoyancyField
+using Oceananigans.Grids: architecture
+using Oceananigans.Fields: ZeroField, ConstantField, FunctionField, compute_at!, indices
+using Oceananigans.Models.HydrostaticFreeSurfaceModels: tracernames
+using Oceananigans.ImmersedBoundaries: conditional_length
+using Oceananigans.Operators: ℑxyᶜᶠᵃ, ℑxyᶠᶜᵃ, hack_cosd
+using Oceananigans.Solvers: constructors, unpack_constructors
+using Oceananigans.TurbulenceClosures: with_tracers
+using Oceananigans.MultiRegion: reconstruct_global_grid, reconstruct_global_field, getnamewrapper
 
-import Oceananigans.Fields: interior
 import Oceananigans.Utils: launch!, datatuple
-
 Logging.global_logger(OceananigansLogger())
 
 #####
@@ -66,7 +76,16 @@ closures = (
     :ConvectiveAdjustmentVerticalDiffusivity,
 )
 
-include("utils_for_runtests.jl")
+if !(@isdefined already_included) # define the flag and macro only the first time this file is evaluated
+    already_included = Ref(false) # set to true below, after utils_for_runtests.jl has been included
+    macro include_once(expr)
+        return !(already_included[]) ? :($(esc(expr))) : :(nothing) # flag is read when the macro is expanded
+    end
+end
+
+@include_once include("utils_for_runtests.jl")
+already_included[] = true
 
 float_types = (Float32, Float64)
 archs = test_architectures()
+
diff --git a/test/regression_tests/hydrostatic_free_turbulence_regression_test.jl b/test/regression_tests/hydrostatic_free_turbulence_regression_test.jl
index 7b52aaadfc..8e52a910d5 100644
--- a/test/regression_tests/hydrostatic_free_turbulence_regression_test.jl
+++ b/test/regression_tests/hydrostatic_free_turbulence_regression_test.jl
@@ -6,7 +6,7 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: HydrostaticFreeSurfaceMo
 using Oceananigans.TurbulenceClosures: HorizontalScalarDiffusivity
 
 using Oceananigans.DistributedComputations: Distributed, DistributedGrid, DistributedComputations, all_reduce
-using Oceananigans.DistributedComputations: reconstruct_global_topology, partition_global_array, cpu_architecture
+using Oceananigans.DistributedComputations: reconstruct_global_topology, partition, cpu_architecture
 
 using JLD2
 
@@ -121,10 +121,10 @@ function run_hydrostatic_free_turbulence_regression_test(grid, free_surface; reg
         # Data was saved with 2 halos per direction (see issue #3260)
         H = 2
         truth_fields = (
-            u = partition_global_array(cpu_arch, file["timeseries/u/$stop_iteration"][H+1:end-H, H+1:end-H, H+1:end-H], size(u)),
-            v = partition_global_array(cpu_arch, file["timeseries/v/$stop_iteration"][H+1:end-H, H+1:end-H, H+1:end-H], size(v)),
-            w = partition_global_array(cpu_arch, file["timeseries/w/$stop_iteration"][H+1:end-H, H+1:end-H, H+1:end-H], size(w)),
-            η = partition_global_array(cpu_arch, file["timeseries/η/$stop_iteration"][H+1:end-H, H+1:end-H, :], size(η))
+            u = partition(file["timeseries/u/$stop_iteration"][H+1:end-H, H+1:end-H, H+1:end-H], cpu_arch, size(u)),
+            v = partition(file["timeseries/v/$stop_iteration"][H+1:end-H, H+1:end-H, H+1:end-H], cpu_arch, size(v)),
+            w = partition(file["timeseries/w/$stop_iteration"][H+1:end-H, H+1:end-H, H+1:end-H], cpu_arch, size(w)),
+            η = partition(file["timeseries/η/$stop_iteration"][H+1:end-H, H+1:end-H, :], cpu_arch, size(η))
         )
 
         close(file)
diff --git a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
index 395d8ecb60..8a9ac391e9 100644
--- a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
+++ b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
@@ -1,5 +1,6 @@
 using Oceananigans.TurbulenceClosures: AnisotropicMinimumDissipation
 using Oceananigans.TimeSteppers: update_state!
+using Oceananigans.DistributedComputations: cpu_architecture, partition
 
 function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closure)
     name = "ocean_large_eddy_simulation_" * string(typeof(first(closure)).name.wrapper)
@@ -23,14 +24,15 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
     end
 
     # Boundary conditions
-    u_bcs = FieldBoundaryConditions(top = BoundaryCondition(Flux, Qᵘ))
-    T_bcs = FieldBoundaryConditions(top = BoundaryCondition(Flux, Qᵀ), bottom = BoundaryCondition(Gradient, ∂T∂z))
-    S_bcs = FieldBoundaryConditions(top = BoundaryCondition(Flux, 5e-8))
+    u_bcs = FieldBoundaryConditions(top = BoundaryCondition(Flux(), Qᵘ))
+    T_bcs = FieldBoundaryConditions(top = BoundaryCondition(Flux(), Qᵀ), bottom = BoundaryCondition(Gradient(), ∂T∂z))
+    S_bcs = FieldBoundaryConditions(top = BoundaryCondition(Flux(), 5e-8))
 
     equation_of_state = LinearEquationOfState(thermal_expansion=2e-4, haline_contraction=8e-4)
 
     # Model instantiation
     model = NonhydrostaticModel(; grid, closure,
+                                timestepper = :QuasiAdamsBashforth2,
                                 coriolis = FPlane(f=1e-4),
                                 buoyancy = SeawaterBuoyancy(; equation_of_state),
                                 tracers = (:T, :S),
@@ -39,6 +41,10 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
 
     # The type of the underlying data, not the offset array.
     ArrayType = typeof(model.velocities.u.data.parent)
+    nx, ny, nz = size(model.tracers.T)
+
+    u, v, w = model.velocities
+    T, S = model.tracers
 
     ####
     #### Uncomment the block below to generate regression data.
@@ -77,32 +83,49 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
 
     Nz = grid.Nz
 
-    solution_indices   = [2:N+3, 2:N+3, 2:N+3]
-    w_solution_indices = [2:N+3, 2:N+3, 2:N+4]
-
-    parent(model.velocities.u)[solution_indices...]   .= ArrayType(solution₀.u)
-    parent(model.velocities.v)[solution_indices...]   .= ArrayType(solution₀.v)
-    parent(model.velocities.w)[w_solution_indices...] .= ArrayType(solution₀.w)
-    parent(model.tracers.T)[solution_indices...]      .= ArrayType(solution₀.T)
-    parent(model.tracers.S)[solution_indices...]      .= ArrayType(solution₀.S)
-
-    parent(model.timestepper.Gⁿ.u)[solution_indices...]   .= ArrayType(Gⁿ₀.u)
-    parent(model.timestepper.Gⁿ.v)[solution_indices...]   .= ArrayType(Gⁿ₀.v)
-    parent(model.timestepper.Gⁿ.w)[w_solution_indices...] .= ArrayType(Gⁿ₀.w)
-    parent(model.timestepper.Gⁿ.T)[solution_indices...]   .= ArrayType(Gⁿ₀.T)
-    parent(model.timestepper.Gⁿ.S)[solution_indices...]   .= ArrayType(Gⁿ₀.S)
-
-    parent(model.timestepper.G⁻.u)[solution_indices...]   .= ArrayType(G⁻₀.u)
-    parent(model.timestepper.G⁻.v)[solution_indices...]   .= ArrayType(G⁻₀.v)
-    parent(model.timestepper.G⁻.w)[w_solution_indices...] .= ArrayType(G⁻₀.w)
-    parent(model.timestepper.G⁻.T)[solution_indices...]   .= ArrayType(G⁻₀.T)
-    parent(model.timestepper.G⁻.S)[solution_indices...]   .= ArrayType(G⁻₀.S)
+    cpu_arch = cpu_architecture(architecture(grid))
+
+    u₀ = partition(ArrayType(solution₀.u[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(u))
+    v₀ = partition(ArrayType(solution₀.v[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(v))
+    w₀ = partition(ArrayType(solution₀.w[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(w))
+    T₀ = partition(ArrayType(solution₀.T[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(T))
+    S₀ = partition(ArrayType(solution₀.S[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(S))
+
+    Gⁿu₀ = partition(ArrayType(Gⁿ₀.u)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(u))
+    Gⁿv₀ = partition(ArrayType(Gⁿ₀.v)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(v))
+    Gⁿw₀ = partition(ArrayType(Gⁿ₀.w)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(w))
+    GⁿT₀ = partition(ArrayType(Gⁿ₀.T)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(T))
+    GⁿS₀ = partition(ArrayType(Gⁿ₀.S)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(S))
+
+    G⁻u₀ = partition(ArrayType(G⁻₀.u)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(u))
+    G⁻v₀ = partition(ArrayType(G⁻₀.v)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(v))
+    G⁻w₀ = partition(ArrayType(G⁻₀.w)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(w))
+    G⁻T₀ = partition(ArrayType(G⁻₀.T)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(T))
+    G⁻S₀ = partition(ArrayType(G⁻₀.S)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(S))
+
+    interior(model.velocities.u) .= u₀
+    interior(model.velocities.v) .= v₀
+    interior(model.velocities.w) .= w₀
+    interior(model.tracers.T)    .= T₀
+    interior(model.tracers.S)    .= S₀
+
+    interior(model.timestepper.Gⁿ.u) .= Gⁿu₀
+    interior(model.timestepper.Gⁿ.v) .= Gⁿv₀
+    interior(model.timestepper.Gⁿ.w) .= Gⁿw₀
+    interior(model.timestepper.Gⁿ.T) .= GⁿT₀
+    interior(model.timestepper.Gⁿ.S) .= GⁿS₀
+
+    interior(model.timestepper.G⁻.u) .= G⁻u₀
+    interior(model.timestepper.G⁻.v) .= G⁻v₀
+    interior(model.timestepper.G⁻.w) .= G⁻w₀
+    interior(model.timestepper.G⁻.T) .= G⁻T₀
+    interior(model.timestepper.G⁻.S) .= G⁻S₀
 
     model.clock.time = spinup_steps * Δt
     model.clock.iteration = spinup_steps
 
     update_state!(model; compute_tendencies = true)
-    model.timestepper.previous_Δt = Δt
+    model.clock.last_Δt = Δt
 
     for n in 1:test_steps
         time_step!(model, Δt, euler=false)
@@ -115,15 +138,23 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
 
     test_fields = CUDA.@allowscalar (u = Array(interior(model.velocities.u)),
                                      v = Array(interior(model.velocities.v)),
-                                     w = Array(interior(model.velocities.w)[:, :, 1:N]),
+                                     w = Array(interior(model.velocities.w)[:, :, 1:nz]),
                                      T = Array(interior(model.tracers.T)),
                                      S = Array(interior(model.tracers.S)))
 
-    correct_fields = (u = Array(solution₁.u)[2:N+1, 2:N+1, 2:N+1],
-                      v = Array(solution₁.v)[2:N+1, 2:N+1, 2:N+1],
-                      w = Array(solution₁.w)[2:N+1, 2:N+1, 2:N+1],
-                      T = Array(solution₁.T)[2:N+1, 2:N+1, 2:N+1],
-                      S = Array(solution₁.S)[2:N+1, 2:N+1, 2:N+1])
+    u₁ = partition(Array(solution₁.u)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(u))
+    v₁ = partition(Array(solution₁.v)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(v))
+    w₁ = partition(Array(solution₁.w)[2:end-1, 2:end-1, 2:end-2], cpu_arch, size(test_fields.w))
+    T₁ = partition(Array(solution₁.T)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(T))
+    S₁ = partition(Array(solution₁.S)[2:end-1, 2:end-1, 2:end-1], cpu_arch, size(S))
+
+    @show size(test_fields.w), size(w₁)
+
+    correct_fields = (u = u₁,
+                      v = v₁,
+                      w = w₁,
+                      T = T₁,
+                      S = S₁)
 
     summarize_regression_test(test_fields, correct_fields)
 
diff --git a/test/regression_tests/rayleigh_benard_regression_test.jl b/test/regression_tests/rayleigh_benard_regression_test.jl
index 2f64eb30aa..30395690ac 100644
--- a/test/regression_tests/rayleigh_benard_regression_test.jl
+++ b/test/regression_tests/rayleigh_benard_regression_test.jl
@@ -1,5 +1,6 @@
 using Oceananigans.Grids: xnode, znode
 using Oceananigans.TimeSteppers: update_state!
+using Oceananigans.DistributedComputations: cpu_architecture, partition, reconstruct_global_grid
 
 function run_rayleigh_benard_regression_test(arch, grid_type)
 
@@ -43,10 +44,11 @@ function run_rayleigh_benard_regression_test(arch, grid_type)
 
     cforcing = Forcing(Fc, discrete_form=true)
 
-    bbcs = FieldBoundaryConditions(top = BoundaryCondition(Value, 0.0),
-                                   bottom = BoundaryCondition(Value, Δb))
+    bbcs = FieldBoundaryConditions(top = BoundaryCondition(Value(), 0.0),
+                                   bottom = BoundaryCondition(Value(), Δb))
 
     model = NonhydrostaticModel(; grid,
+                                timestepper = :QuasiAdamsBashforth2,
                                 closure = ScalarDiffusivity(ν=ν, κ=κ),
                                 tracers = (:b, :c),
                                 buoyancy = Buoyancy(model=BuoyancyTracer()),
@@ -71,6 +73,9 @@ function run_rayleigh_benard_regression_test(arch, grid_type)
     checkpointer = Checkpointer(model, schedule=IterationInterval(test_steps), prefix=prefix,
                                 dir=joinpath(dirname(@__FILE__), "data"))
 
+    u, v, w = model.velocities
+    b, c = model.tracers
+
     #####
     ##### Initial condition and spinup steps for creating regression test data
     #####
@@ -100,23 +105,39 @@ function run_rayleigh_benard_regression_test(arch, grid_type)
 
     solution₀, Gⁿ₀, G⁻₀ = get_fields_from_checkpoint(initial_filename)
 
-    model.velocities.u.data.parent .= ArrayType(solution₀.u)
-    model.velocities.v.data.parent .= ArrayType(solution₀.v)
-    model.velocities.w.data.parent .= ArrayType(solution₀.w)
-    model.tracers.b.data.parent    .= ArrayType(solution₀.b)
-    model.tracers.c.data.parent    .= ArrayType(solution₀.c)
+    cpu_arch = cpu_architecture(architecture(grid))
+
+    u₀ = partition(ArrayType(solution₀.u[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(u))
+    v₀ = partition(ArrayType(solution₀.v[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(v))
+    w₀ = partition(ArrayType(solution₀.w[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(w))
+    b₀ = partition(ArrayType(solution₀.b[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(b))
+    c₀ = partition(ArrayType(solution₀.c[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(c))
+
+    Gⁿu₀ = partition(ArrayType(Gⁿ₀.u[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(u))
+    Gⁿv₀ = partition(ArrayType(Gⁿ₀.v[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(v))
+    Gⁿw₀ = partition(ArrayType(Gⁿ₀.w[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(w))
+    Gⁿb₀ = partition(ArrayType(Gⁿ₀.b[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(b))
+    Gⁿc₀ = partition(ArrayType(Gⁿ₀.c[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(c))
+
+    G⁻u₀ = partition(ArrayType(G⁻₀.u[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(u))
+    G⁻v₀ = partition(ArrayType(G⁻₀.v[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(v))
+    G⁻w₀ = partition(ArrayType(G⁻₀.w[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(w))
+    G⁻b₀ = partition(ArrayType(G⁻₀.b[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(b))
+    G⁻c₀ = partition(ArrayType(G⁻₀.c[2:end-1, 2:end-1, 2:end-1]), cpu_arch, size(c))
 
-    model.timestepper.Gⁿ.u.data.parent .= ArrayType(Gⁿ₀.u)
-    model.timestepper.Gⁿ.v.data.parent .= ArrayType(Gⁿ₀.v)
-    model.timestepper.Gⁿ.w.data.parent .= ArrayType(Gⁿ₀.w)
-    model.timestepper.Gⁿ.b.data.parent .= ArrayType(Gⁿ₀.b)
-    model.timestepper.Gⁿ.c.data.parent .= ArrayType(Gⁿ₀.c)
+    set!(model, u = u₀, v = v₀, w = w₀, b = b₀, c = c₀)
 
-    model.timestepper.G⁻.u.data.parent .= ArrayType(G⁻₀.u)
-    model.timestepper.G⁻.v.data.parent .= ArrayType(G⁻₀.v)
-    model.timestepper.G⁻.w.data.parent .= ArrayType(G⁻₀.w)
-    model.timestepper.G⁻.b.data.parent .= ArrayType(G⁻₀.b)
-    model.timestepper.G⁻.c.data.parent .= ArrayType(G⁻₀.c)
+    set!(model.timestepper.Gⁿ.u, Gⁿu₀)
+    set!(model.timestepper.Gⁿ.v, Gⁿv₀)
+    set!(model.timestepper.Gⁿ.w, Gⁿw₀)
+    set!(model.timestepper.Gⁿ.b, Gⁿb₀)
+    set!(model.timestepper.Gⁿ.c, Gⁿc₀)
+
+    set!(model.timestepper.G⁻.u, G⁻u₀)
+    set!(model.timestepper.G⁻.v, G⁻v₀)
+    set!(model.timestepper.G⁻.w, G⁻w₀)
+    set!(model.timestepper.G⁻.b, G⁻b₀)
+    set!(model.timestepper.G⁻.c, G⁻c₀)
 
     model.clock.iteration = spinup_steps
     model.clock.time = spinup_steps * Δt
@@ -125,7 +146,7 @@ function run_rayleigh_benard_regression_test(arch, grid_type)
     # Step the model forward and perform the regression test
     update_state!(model)
 
-    model.timestepper.previous_Δt = Δt
+    model.clock.last_Δt = Δt
 
     for n in 1:test_steps
         time_step!(model, Δt, euler=false)
@@ -142,20 +163,28 @@ function run_rayleigh_benard_regression_test(arch, grid_type)
                                       b = Array(interior(model.tracers.b)),
                                       c = Array(interior(model.tracers.c)))
 
-    correct_fields = (u = Array(interior(solution₁.u, model.grid)),
-                      v = Array(interior(solution₁.v, model.grid)),
-                      w = Array(interior(solution₁.w, model.grid)),
-                      b = Array(interior(solution₁.b, model.grid)),
-                      c = Array(interior(solution₁.c, model.grid)))
+    global_grid = reconstruct_global_grid(model.grid)
+
+    u₁ = interior(solution₁.u, global_grid)
+    v₁ = interior(solution₁.v, global_grid)
+    w₁ = interior(solution₁.w, global_grid)
+    b₁ = interior(solution₁.b, global_grid)
+    c₁ = interior(solution₁.c, global_grid)
+
+    reference_fields = (u = partition(Array(u₁), cpu_arch, size(u)),
+                        v = partition(Array(v₁), cpu_arch, size(v)),
+                        w = partition(Array(w₁), cpu_arch, size(test_fields.w)),
+                        b = partition(Array(b₁), cpu_arch, size(b)),
+                        c = partition(Array(c₁), cpu_arch, size(c)))
 
-    summarize_regression_test(test_fields, correct_fields)
+    summarize_regression_test(test_fields, reference_fields)
 
     CUDA.allowscalar(true)
-    @test all(test_fields.u .≈ correct_fields.u)
-    @test all(test_fields.v .≈ correct_fields.v)
-    @test all(test_fields.w .≈ correct_fields.w)
-    @test all(test_fields.b .≈ correct_fields.b)
-    @test all(test_fields.c .≈ correct_fields.c)
+    @test all(test_fields.u .≈ reference_fields.u)
+    @test all(test_fields.v .≈ reference_fields.v)
+    @test all(test_fields.w .≈ reference_fields.w)
+    @test all(test_fields.b .≈ reference_fields.b)
+    @test all(test_fields.c .≈ reference_fields.c)
     CUDA.allowscalar(false)
 
     return nothing
diff --git a/test/regression_tests/thermal_bubble_regression_test.jl b/test/regression_tests/thermal_bubble_regression_test.jl
index ad6de22327..24e6c90dbf 100644
--- a/test/regression_tests/thermal_bubble_regression_test.jl
+++ b/test/regression_tests/thermal_bubble_regression_test.jl
@@ -1,3 +1,5 @@
+using Oceananigans.DistributedComputations: cpu_architecture, partition
+
 function run_thermal_bubble_regression_test(arch, grid_type)
     Nx, Ny, Nz = 16, 16, 16
     Lx, Ly, Lz = 100, 100, 100
@@ -13,6 +15,7 @@ function run_thermal_bubble_regression_test(arch, grid_type)
     closure = ScalarDiffusivity(ν=4e-2, κ=4e-2)
 
     model = NonhydrostaticModel(; grid, closure,
+                                timestepper = :QuasiAdamsBashforth2,
                                 coriolis = FPlane(f=1e-4),
                                 buoyancy = SeawaterBuoyancy(),
                                 hydrostatic_pressure_anomaly = CenterField(grid),
@@ -70,19 +73,27 @@ function run_thermal_bubble_regression_test(arch, grid_type)
     copyto!(test_fields.T, interior(model.tracers.T))
     copyto!(test_fields.S, interior(model.tracers.S))
 
-    correct_fields = (u = ds["u"][:, :, :, end],
-                      v = ds["v"][:, :, :, end],
-                      w = ds["w"][:, :, :, end],
-                      T = ds["T"][:, :, :, end],
-                      S = ds["S"][:, :, :, end])
+    reference_fields = (u = ds["u"][:, :, :, end],
+                        v = ds["v"][:, :, :, end],
+                        w = ds["w"][:, :, :, end],
+                        T = ds["T"][:, :, :, end],
+                        S = ds["S"][:, :, :, end])
+
+    cpu_arch = cpu_architecture(architecture(grid))
+
+    reference_fields = (u = partition(reference_fields.u, cpu_arch, size(reference_fields.u)),
+                        v = partition(reference_fields.v, cpu_arch, size(reference_fields.v)),
+                        w = partition(reference_fields.w, cpu_arch, size(reference_fields.w)),
+                        T = partition(reference_fields.T, cpu_arch, size(reference_fields.T)),
+                        S = partition(reference_fields.S, cpu_arch, size(reference_fields.S)))
 
-    summarize_regression_test(test_fields, correct_fields)
+    summarize_regression_test(test_fields, reference_fields)
     
-    @test all(test_fields.u .≈ correct_fields.u)
-    @test all(test_fields.v .≈ correct_fields.v)
-    @test all(test_fields.w .≈ correct_fields.w)
-    @test all(test_fields.T .≈ correct_fields.T)
-    @test all(test_fields.S .≈ correct_fields.S)
+    @test all(test_fields.u .≈ reference_fields.u)
+    @test all(test_fields.v .≈ reference_fields.v)
+    @test all(test_fields.w .≈ reference_fields.w)
+    @test all(test_fields.T .≈ reference_fields.T)
+    @test all(test_fields.S .≈ reference_fields.S)
     
     return nothing
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index b954e1d14b..c78eeefbb6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,3 +1,5 @@
+using Pkg
+
 include("dependencies_for_runtests.jl")
 
 group     = get(ENV, "TEST_GROUP", :all) |> Symbol
@@ -17,12 +19,29 @@ end
 CUDA.allowscalar() do
 
 @testset "Oceananigans" begin
+
     if test_file != :none
         @testset "Single file test" begin
             include(String(test_file))
         end
     end
 
+    # Initialization steps: instantiate and precompile the environment, then report versions
+    if group == :init || group == :all
+        Pkg.instantiate(; verbose=true)
+        Pkg.precompile(; strict=true)
+        Pkg.status()
+
+        try # best-effort: a failure here is reported but must not abort the run
+            MPI.versioninfo()
+        catch err; @warn("MPI.versioninfo() failed", exception=err); end
+
+        try # best-effort: presumably CUDA may be non-functional on CPU-only agents
+            CUDA.precompile_runtime()
+            CUDA.versioninfo()
+        catch err; @warn("CUDA runtime initialization failed", exception=err); end
+    end
+
     # Core Oceananigans
     if group == :unit || group == :all
         @testset "Unit tests" begin
@@ -152,21 +171,29 @@ CUDA.allowscalar() do
 
     if group == :distributed || group == :all
         MPI.Initialized() || MPI.Init()
+        archs = test_architectures()
         include("test_distributed_models.jl")
     end
 
     if group == :distributed_solvers || group == :all
         MPI.Initialized() || MPI.Init()
+        include("test_distributed_transpose.jl")
         include("test_distributed_poisson_solvers.jl")
     end
 
     if group == :distributed_hydrostatic_model || group == :all
         MPI.Initialized() || MPI.Init()
-        archs = test_architectures() 
+        archs = test_architectures()
         include("test_hydrostatic_regression.jl")
         include("test_distributed_hydrostatic_model.jl")
     end
 
+    if group == :distributed_nonhydrostatic_regression || group == :all
+        MPI.Initialized() || MPI.Init()
+        archs = nonhydrostatic_regression_test_architectures()
+        include("test_nonhydrostatic_regression.jl")
+    end
+
     if group == :nonhydrostatic_regression || group == :all
         include("test_nonhydrostatic_regression.jl")
     end
diff --git a/test/test_abstract_operations.jl b/test/test_abstract_operations.jl
index 5a0aa9ea31..c9942895c8 100644
--- a/test/test_abstract_operations.jl
+++ b/test/test_abstract_operations.jl
@@ -1,9 +1,5 @@
 include("dependencies_for_runtests.jl")
 
-using Oceananigans.Operators: ℑxyᶜᶠᵃ, ℑxyᶠᶜᵃ
-using Oceananigans.Fields: ZeroField, ConstantField, compute_at!, indices
-using Oceananigans.BuoyancyModels: BuoyancyField
-
 function simple_binary_operation(op, a, b, num1, num2)
     a_b = op(a, b)
     interior(a) .= num1
@@ -371,3 +367,4 @@ for arch in archs
         end
     end
 end
+
diff --git a/test/test_boundary_conditions.jl b/test/test_boundary_conditions.jl
index 06e3859f58..0cb88f4330 100644
--- a/test/test_boundary_conditions.jl
+++ b/test/test_boundary_conditions.jl
@@ -21,7 +21,7 @@ end
     @testset "Boundary condition instantiation" begin
         @info "  Testing boundary condition instantiation..."
 
-        for C in (Value, Gradient, Flux)
+        for C in (Value, Gradient, Flux, Value(), Gradient(), Flux())
             @test can_instantiate_boundary_condition(integer_bc, C)
             @test can_instantiate_boundary_condition(irrational_bc, C)
             @test can_instantiate_boundary_condition(simple_function_bc, C)
@@ -235,7 +235,7 @@ end
         @test T_bcs.top.condition.func === simple_bc
         @test T_bcs.bottom.condition.func === simple_bc
 
-        one_bc = BoundaryCondition(Value, 1.0)
+        one_bc = BoundaryCondition(Value(), 1.0)
 
         T_bcs = FieldBoundaryConditions(   east = one_bc,
                                            west = one_bc,
diff --git a/test/test_boundary_conditions_integration.jl b/test/test_boundary_conditions_integration.jl
index 1f7a9ae84a..5113906191 100644
--- a/test/test_boundary_conditions_integration.jl
+++ b/test/test_boundary_conditions_integration.jl
@@ -13,7 +13,7 @@ function test_boundary_condition(arch, FT, topo, side, field_name, boundary_cond
                                 buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
 
     success = try
-        time_step!(model, 1e-16, euler=true)
+        time_step!(model, 1e-16)
         true
     catch err
         @warn "test_boundary_condition errored with " * sprint(showerror, err)
@@ -27,7 +27,7 @@ function test_nonhydrostatic_flux_budget(grid, name, side, L)
     FT = eltype(grid)
     flux = FT(π)
     direction = side ∈ (:west, :south, :bottom, :immersed) ? 1 : -1
-    bc_kwarg = Dict(side => BoundaryCondition(Flux, flux * direction))
+    bc_kwarg = Dict(side => BoundaryCondition(Flux(), flux * direction))
     field_bcs = FieldBoundaryConditions(; bc_kwarg...)
     boundary_conditions = (; name => field_bcs)
 
@@ -61,16 +61,18 @@ function fluxes_with_diffusivity_boundary_conditions_are_correct(arch, FT)
     κₑ_bcs = FieldBoundaryConditions(grid, (Center, Center, Center), bottom=ValueBoundaryCondition(κ₀))
     model_bcs = (b=buoyancy_bcs, κₑ=(b=κₑ_bcs,))
 
-    model = NonhydrostaticModel(
-        grid=grid, tracers=:b, buoyancy=BuoyancyTracer(),
-        closure=AnisotropicMinimumDissipation(), boundary_conditions=model_bcs
-    )
+    model = NonhydrostaticModel(; grid,
+                                timestepper = :QuasiAdamsBashforth2,
+                                tracers = :b,
+                                buoyancy = BuoyancyTracer(),
+                                closure = AnisotropicMinimumDissipation(),
+                                boundary_conditions = model_bcs)
 
     b₀(x, y, z) = z * bz
     set!(model, b=b₀)
 
     b = model.tracers.b
-    mean_b₀ = mean(interior(b))
+    mean_b₀ = mean(b)
 
     τκ = Lz^2 / κ₀  # Diffusion time-scale
     Δt = 1e-6 * τκ  # Time step much less than diffusion time-scale
@@ -98,7 +100,7 @@ function fluxes_with_diffusivity_boundary_conditions_are_correct(arch, FT)
     # mean(interior(b)) - mean_b₀ = -3.141592656086267e-5
     # (flux * model.clock.time) / Lz = -3.141592653589793e-5
     
-    return isapprox(mean(interior(b)) - mean_b₀, flux * model.clock.time / Lz, atol=1e-6)
+    return isapprox(mean(b) - mean_b₀, flux * model.clock.time / Lz, atol=1e-6)
 end
 
 test_boundary_conditions(C, FT, ArrayType) = (integer_bc(C, FT, ArrayType),
@@ -226,7 +228,8 @@ test_boundary_conditions(C, FT, ArrayType) = (integer_bc(C, FT, ArrayType),
 
             rectilinear_grid(topology) = RectilinearGrid(arch; topology, z=(0, Lz), grid_kw...)
             immersed_rectilinear_grid(topology) = ImmersedBoundaryGrid(RectilinearGrid(arch; topology, z=(-Lz, Lz), grid_kw...), ib)
-            grids_to_test(topo) = [rectilinear_grid(topo), immersed_rectilinear_grid(topo)]
+            immersed_active_rectilinear_grid(topology) = ImmersedBoundaryGrid(RectilinearGrid(arch; topology, z=(-Lz, Lz), grid_kw...), ib; active_cells_map = true)
+            grids_to_test(topo) = [rectilinear_grid(topo), immersed_rectilinear_grid(topo), immersed_active_rectilinear_grid(topo)]
 
             for grid in grids_to_test((Periodic, Bounded, Bounded))
                 for name in (:u, :c)
diff --git a/test/test_checkpointer.jl b/test/test_checkpointer.jl
index 234bd590ef..c32fad63b3 100644
--- a/test/test_checkpointer.jl
+++ b/test/test_checkpointer.jl
@@ -22,6 +22,25 @@ function test_model_equality(test_model, true_model)
     return nothing
 end
 
+"""
+    initialization_test_simulation(arch, stop_time, Δt=1, δt=2)
+
+Return a `Simulation` (time step `Δt`) of a `NonhydrostaticModel` on an all-`Flat` grid, with a progress
+callback on `TimeInterval(δt)` and a `Checkpointer` (prefix "initialization_test") on `TimeInterval(stop_time)`.
+"""
+function initialization_test_simulation(arch, stop_time, Δt=1, δt=2)
+    model = NonhydrostaticModel(grid = RectilinearGrid(arch, size=(), topology=(Flat, Flat, Flat)))
+    simulation = Simulation(model, Δt=Δt, stop_time=stop_time)
+
+    log_progress(sim) = @info string("Iter: ", iteration(sim), ", time: ", prettytime(sim))
+    simulation.callbacks[:progress] = Callback(log_progress, TimeInterval(δt))
+
+    simulation.output_writers[:checkpointer] = Checkpointer(model, schedule = TimeInterval(stop_time),
+                                                            prefix = "initialization_test", cleanup = false)
+
+    return simulation
+end
+
 """
 Run two coarse rising thermal bubble simulations and make sure
 
@@ -34,10 +53,7 @@ Run two coarse rising thermal bubble simulations and make sure
 3. run!(test_model, pickup) works as expected
 """
 function test_thermal_bubble_checkpointer_output(arch)
-    #####
-    ##### Create and run "true model"
-    #####
-
+    # Create and run "true model"
     Nx, Ny, Nz = 16, 16, 16
     Lx, Ly, Lz = 100, 100, 100
     Δt = 6
@@ -58,10 +74,7 @@ function test_thermal_bubble_checkpointer_output(arch)
 end
 
 function test_hydrostatic_splash_checkpointer(arch, free_surface)
-    #####
-    ##### Create and run "true model"
-    #####
-
+    # Create and run "true model"
     Nx, Ny, Nz = 16, 16, 4
     Lx, Ly, Lz = 1, 1, 1
 
@@ -78,7 +91,6 @@ function test_hydrostatic_splash_checkpointer(arch, free_surface)
 end
 
 function run_checkpointer_tests(true_model, test_model, Δt)
-
     true_simulation = Simulation(true_model, Δt=Δt, stop_iteration=5)
 
     checkpointer = Checkpointer(true_model, schedule=IterationInterval(5), overwrite_existing=true)
@@ -102,7 +114,7 @@ function run_checkpointer_tests(true_model, test_model, Δt)
     test_model_equality(test_model, checkpointed_model)
 
     # This only applies to QuasiAdamsBashforthTimeStepper:
-    @test test_model.timestepper.previous_Δt == checkpointed_model.timestepper.previous_Δt
+    @test test_model.clock.last_Δt == checkpointed_model.clock.last_Δt
 
     #####
     ##### Test pickup from explicit checkpoint path
@@ -162,10 +174,7 @@ end
 
 function run_checkpointer_cleanup_tests(arch)
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
-    model = NonhydrostaticModel(grid=grid,
-                                buoyancy=SeawaterBuoyancy(), tracers=(:T, :S)
-                                )
-
+    model = NonhydrostaticModel(; grid, buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
     simulation = Simulation(model, Δt=0.2, stop_iteration=10)
 
     simulation.output_writers[:checkpointer] = Checkpointer(model, schedule=IterationInterval(3), cleanup=true)
@@ -191,5 +200,20 @@ for arch in archs
         end
 
         run_checkpointer_cleanup_tests(arch)
+
+        # Run a simulation that saves data to a checkpoint, first removing stale checkpoints.
+        # `rm` does not expand glob patterns, so the matching files are listed explicitly.
+        foreach(f -> rm(f, force=true), filter(f -> occursin(r"^initialization_test_iteration\d+\.jld2$", f), readdir()))
+        simulation = initialization_test_simulation(arch, 4)
+        run!(simulation)
+
+        # Now try again, but picking up from the previous checkpoint
+        N = iteration(simulation)
+        checkpoint = "initialization_test_iteration$N.jld2"
+        simulation = initialization_test_simulation(arch, 8)
+        run!(simulation, pickup=checkpoint)
+
+        progress_cb = simulation.callbacks[:progress]
+        @test progress_cb.schedule.first_actuation_time == 4
     end
 end
diff --git a/test/test_computed_field.jl b/test/test_computed_field.jl
index 2f76ceaf23..c19ae97918 100644
--- a/test/test_computed_field.jl
+++ b/test/test_computed_field.jl
@@ -1,11 +1,5 @@
 include("dependencies_for_runtests.jl")
 
-using Oceananigans.AbstractOperations: UnaryOperation, Derivative, BinaryOperation, MultiaryOperation
-using Oceananigans.AbstractOperations: KernelFunctionOperation
-using Oceananigans.Operators: ℑxyᶜᶠᵃ, ℑxyᶠᶜᵃ
-using Oceananigans.Fields: compute_at!
-using Oceananigans.BuoyancyModels: BuoyancyField
-
 function compute_derivative(model, ∂)
     T, S = model.tracers
     parent(S) .= π
@@ -340,8 +334,9 @@ for arch in archs
         underlying_grid = RectilinearGrid(arch, size=(4, 4, 4), extent=(1, 1, 1), topology=(Periodic, Periodic, Bounded))
         bottom(x, y) = -2 # below the grid!
         immersed_grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bottom))
+        immersed_active_grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bottom); active_cells_map = true)
 
-        for grid in (underlying_grid, immersed_grid)
+        for grid in (underlying_grid, immersed_grid, immersed_active_grid)
             G = typeof(grid).name.wrapper
             model = NonhydrostaticModel(; grid, buoyancy, tracers = (:T, :S))
 
diff --git a/test/test_conditional_reductions.jl b/test/test_conditional_reductions.jl
index 85bbffa26b..d083da0ee5 100644
--- a/test/test_conditional_reductions.jl
+++ b/test/test_conditional_reductions.jl
@@ -1,9 +1,5 @@
 include("dependencies_for_runtests.jl")
 
-using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid, GridFittedBoundary
-using Oceananigans.ImmersedBoundaries: conditional_length
-using Statistics: mean, mean!, norm
-
 @testset "Conditional Reductions" begin
     for arch in archs
         @info "    Testing Reductions on Immersed fields"
diff --git a/test/test_cubed_spheres.jl b/test/test_cubed_spheres.jl
index be97ffaa33..d8984cb0a1 100644
--- a/test/test_cubed_spheres.jl
+++ b/test/test_cubed_spheres.jl
@@ -2,10 +2,139 @@ include("dependencies_for_runtests.jl")
 include("data_dependencies.jl")
 
 using Statistics: mean
+using Oceananigans.Operators
 using Oceananigans.CubedSpheres
 using Oceananigans.Models.HydrostaticFreeSurfaceModels
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: VerticalVorticityField
 
+using OrthogonalSphericalShellGrids
+    
+# Used in the test below as `KernelFunctionOperation`s; `extrinsic_*` must call `extrinsic_vector` to undo `intrinsic_*`
+@inline intrinsic_vector_x_component(i, j, k, grid, uₑ, vₑ) =
+    @inbounds intrinsic_vector(i, j, k, grid, uₑ, vₑ)[1]
+
+@inline intrinsic_vector_y_component(i, j, k, grid, uₑ, vₑ) =
+    @inbounds intrinsic_vector(i, j, k, grid, uₑ, vₑ)[2]
+
+@inline extrinsic_vector_x_component(i, j, k, grid, uᵢ, vᵢ) =
+    @inbounds extrinsic_vector(i, j, k, grid, uᵢ, vᵢ)[1]
+
+@inline extrinsic_vector_y_component(i, j, k, grid, uᵢ, vᵢ) =
+    @inbounds extrinsic_vector(i, j, k, grid, uᵢ, vᵢ)[2]
+
+# Build the field 0.5 * (u * u + v * v), compute it, and return it.
+function kinetic_energy(u, v)
+    return compute!(Field(0.5 * (u * u + v * v)))
+end
+
+function test_vector_rotation(grid)
+    u = XFaceField(grid)
+    v = YFaceField(grid)
+    
+    # Case 1: purely longitudinal flow in the extrinsic coordinate system
+    set!(u, 1)
+    set!(v, 0)
+
+    # Convert it to an "Intrinsic" reference frame
+    uᵢ = KernelFunctionOperation{Face, Center, Center}(intrinsic_vector_x_component, grid, u, v)
+    vᵢ = KernelFunctionOperation{Center, Face, Center}(intrinsic_vector_y_component, grid, u, v)
+    
+    uᵢ = compute!(Field(uᵢ))
+    vᵢ = compute!(Field(vᵢ))
+
+    # The extrema of u and v, as well as their mean value should
+    # be equivalent on an "intrinsic" frame
+    @test maximum(uᵢ) ≈ maximum(vᵢ)
+    @test minimum(uᵢ) ≈ minimum(vᵢ)
+    @test mean(uᵢ) ≈ mean(vᵢ)
+    @test mean(uᵢ) > 0 # The mean value should be positive
+
+    # Kinetic energy should remain the same
+    KE = kinetic_energy(uᵢ, vᵢ)
+    @test all(on_architecture(CPU(), interior(KE)) .≈ 0.5)
+
+    # Convert it back to a purely zonal velocity (vₑ == 0)
+    uₑ = KernelFunctionOperation{Face, Center, Center}(extrinsic_vector_x_component, grid, uᵢ, vᵢ)
+    vₑ = KernelFunctionOperation{Center, Face, Center}(extrinsic_vector_y_component, grid, uᵢ, vᵢ)
+    
+    uₑ = compute!(Field(uₑ))
+    vₑ = compute!(Field(vₑ))
+
+    # Make sure that the flow was converted back to a
+    # purely zonal flow in the extrinsic frame (v ≈ 0)
+    @test all(on_architecture(CPU(), interior(vₑ)) .≈ 0)
+    @test all(on_architecture(CPU(), interior(uₑ)) .≈ 1)
+
+    # Case 2: purely meridional flow in the extrinsic coordinate system
+    set!(u, 0)
+    set!(v, 1)
+
+    # Convert it to an "Intrinsic" reference frame
+    uᵢ = KernelFunctionOperation{Face, Center, Center}(intrinsic_vector_x_component, grid, u, v)
+    vᵢ = KernelFunctionOperation{Center, Face, Center}(intrinsic_vector_y_component, grid, u, v)
+    
+    uᵢ = compute!(Field(uᵢ))
+    vᵢ = compute!(Field(vᵢ))
+
+    # The extrema of u and v, as well as their mean value should
+    # be equivalent on an "intrinsic" frame
+    @test maximum(uᵢ) ≈ maximum(vᵢ)
+    @test minimum(uᵢ) ≈ minimum(vᵢ)
+    @test mean(uᵢ) ≈ mean(vᵢ)
+    @test mean(vᵢ) > 0 # The mean value should be positive
+
+    # Kinetic energy should remain the same
+    KE = kinetic_energy(uᵢ, vᵢ)
+    @test all(on_architecture(CPU(), interior(KE)) .≈ 0.5)
+
+    # Convert it back to a purely meridional velocity (uₑ == 0)
+    uₑ = KernelFunctionOperation{Face, Center, Center}(extrinsic_vector_x_component, grid, uᵢ, vᵢ)
+    vₑ = KernelFunctionOperation{Center, Face, Center}(extrinsic_vector_y_component, grid, uᵢ, vᵢ)
+    
+    uₑ = compute!(Field(uₑ))
+    vₑ = compute!(Field(vₑ))
+
+    # Make sure that the flow was converted back to a
+    # purely meridional flow in the extrinsic frame (u ≈ 0)
+    @test all(on_architecture(CPU(), interior(vₑ)) .≈ 1)
+    @test all(on_architecture(CPU(), interior(uₑ)) .≈ 0)
+
+    # Case 3: mixed zonal and meridional flow.
+    set!(u, 0.5)
+    set!(v, 0.5)
+
+    # Convert it to an "Intrinsic" reference frame
+    uᵢ = KernelFunctionOperation{Face, Center, Center}(intrinsic_vector_x_component, grid, u, v)
+    vᵢ = KernelFunctionOperation{Center, Face, Center}(intrinsic_vector_y_component, grid, u, v)
+    
+    uᵢ = compute!(Field(uᵢ))
+    vᵢ = compute!(Field(vᵢ))
+
+    # The extrema of u and v, as well as their mean value should
+    # be equivalent on an "intrinsic" frame
+    @test maximum(uᵢ) ≈ maximum(vᵢ)
+    @test minimum(uᵢ) ≈ minimum(vᵢ)
+    @test mean(uᵢ) ≈ mean(vᵢ)
+    @test mean(vᵢ) > 0 # The mean value should be positive
+
+    # Kinetic energy should remain the same
+    KE = kinetic_energy(uᵢ, vᵢ)
+    @test all(on_architecture(CPU(), interior(KE)) .≈ 0.25)
+
+    # Convert it back to the mixed extrinsic velocity (uₑ == vₑ == 0.5)
+    uₑ = KernelFunctionOperation{Face, Center, Center}(extrinsic_vector_x_component, grid, uᵢ, vᵢ)
+    vₑ = KernelFunctionOperation{Center, Face, Center}(extrinsic_vector_y_component, grid, uᵢ, vᵢ)
+    
+    uₑ = compute!(Field(uₑ))
+    vₑ = compute!(Field(vₑ))
+
+    # Make sure that the flow was converted back to the
+    # original mixed flow in the extrinsic frame (u ≈ v ≈ 0.5)
+    @test all(on_architecture(CPU(), interior(vₑ)) .≈ 0.5)
+    @test all(on_architecture(CPU(), interior(uₑ)) .≈ 0.5)
+end
+    
+
 @testset "Cubed spheres" begin
 
     @testset "Conformal cubed sphere grid" begin
@@ -41,6 +170,14 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: VerticalVorticityField
                 @test grid isa ConformalCubedSphereGrid
             end
 
+            @testset "Conversion from Intrinsic to Extrinsic reference frame [$(typeof(arch))]" begin
+                @info "  Testing the conversion of a vector between the Intrinsic and Extrinsic reference frame"
+                trg_grid = TripolarGrid(arch, size = (20, 20, 1), z = (0, 1))
+
+                test_vector_rotation(grid)
+                test_vector_rotation(trg_grid)
+            end
+
             @testset "CubedSphereData and CubedSphereFields [$(typeof(arch))]" begin
                 @info "  Testing CubedSphereData and CubedSphereFields [$(typeof(arch))]..."
                 c = model.tracers.c
diff --git a/test/test_diagnostics.jl b/test/test_diagnostics.jl
index b176551989..91a37c532e 100644
--- a/test/test_diagnostics.jl
+++ b/test/test_diagnostics.jl
@@ -139,6 +139,20 @@ function advective_timescale_cfl_on_lat_lon_grid(arch, FT)
     return cfl(model) ≈ CFL_by_hand
 end
 
+# CFL diagnostic on a (Periodic, Flat, Bounded) grid whose only nonzero velocity is `v`, i.e. along
+# the `Flat` direction. Returns `true` when the computed CFL is exactly zero.
+function advective_timescale_cfl_on_flat_2d_grid(arch, FT)
+    Δx = 0.5
+    extent = (0, 3Δx)
+    grid = RectilinearGrid(arch, FT, topology=(Periodic, Flat, Bounded), size=(3, 3), x=extent, z=extent)
+
+    model = NonhydrostaticModel(grid=grid)
+    set!(model, v=1)
+
+    cfl = CFL(FT(1.7), Oceananigans.Advection.cell_advection_timescale)
+    return cfl(model) == 0
+end
+
 get_iteration(model) = model.clock.iteration
 get_time(model) = model.clock.time
 
@@ -177,6 +191,7 @@ end
                 @test advective_timescale_cfl_on_regular_grid(arch, FT)
                 @test advective_timescale_cfl_on_stretched_grid(arch, FT)
                 @test advective_timescale_cfl_on_lat_lon_grid(arch, FT)
+                @test advective_timescale_cfl_on_flat_2d_grid(arch, FT)
             end
         end
     end
diff --git a/test/test_distributed_hydrostatic_model.jl b/test/test_distributed_hydrostatic_model.jl
index 52ac851ebc..c11d3c5248 100644
--- a/test/test_distributed_hydrostatic_model.jl
+++ b/test/test_distributed_hydrostatic_model.jl
@@ -26,7 +26,7 @@ MPI.Initialized() || MPI.Init()
 # to initialize MPI.
 
 using Oceananigans.Operators: hack_cosd
-using Oceananigans.DistributedComputations: partition_global_array, all_reduce, cpu_architecture, reconstruct_global_grid
+using Oceananigans.DistributedComputations: partition, all_reduce, cpu_architecture, reconstruct_global_grid
 
 function Δ_min(grid) 
     Δx_min = minimum_xspacing(grid, Center(), Center(), Center())
@@ -88,12 +88,13 @@ for arch in archs
 
         bottom(λ, φ) = -30 < λ < 30 && -40 < φ < 20 ? 0 : - 1
 
-        immersed_grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bottom); active_cells_map = true)
+        immersed_grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bottom))
+        immersed_active_grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bottom); active_cells_map = true)
 
         global_underlying_grid = reconstruct_global_grid(underlying_grid)
-        global_immersed_grid   = ImmersedBoundaryGrid(global_underlying_grid, GridFittedBottom(bottom); active_cells_map = true)
+        global_immersed_grid   = ImmersedBoundaryGrid(global_underlying_grid, GridFittedBottom(bottom))
 
-        for (grid, global_grid) in zip((underlying_grid, immersed_grid), (global_underlying_grid, global_immersed_grid))
+        for (grid, global_grid) in zip((underlying_grid, immersed_grid, immersed_active_grid), (global_underlying_grid, global_immersed_grid, global_immersed_grid))
 
             # "s" for "serial" computation
             us, vs, ws, cs, ηs = solid_body_rotation_test(global_grid)
@@ -115,11 +116,11 @@ for arch in archs
             c = interior(on_architecture(cpu_arch, c))
             η = interior(on_architecture(cpu_arch, η))
 
-            us = partition_global_array(cpu_arch, us, size(u))
-            vs = partition_global_array(cpu_arch, vs, size(v))
-            ws = partition_global_array(cpu_arch, ws, size(w))
-            cs = partition_global_array(cpu_arch, cs, size(c))
-            ηs = partition_global_array(cpu_arch, ηs, size(η))
+            us = partition(us, cpu_arch, size(u))
+            vs = partition(vs, cpu_arch, size(v))
+            ws = partition(ws, cpu_arch, size(w))
+            cs = partition(cs, cpu_arch, size(c))
+            ηs = partition(ηs, cpu_arch, size(η))
 
             atol = eps(eltype(grid))
             rtol = sqrt(eps(eltype(grid)))
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 1648d4b155..12d45191c9 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -381,7 +381,7 @@ end
 
 function test_triply_periodic_halo_communication_with_221_ranks(halo, child_arch)
     arch = Distributed(child_arch; partition=Partition(2, 2))
-    grid = RectilinearGrid(arch, topology=(Periodic, Periodic, Periodic), size=(8, 8, 3), extent=(1, 2, 3), halo=halo)
+    grid = RectilinearGrid(arch, topology=(Periodic, Periodic, Periodic), size=(8, 8, 4), extent=(1, 2, 3), halo=halo)
     model = NonhydrostaticModel(grid=grid)
 
     for field in merge(fields(model))
diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl
index 772e128902..dca406d8c3 100644
--- a/test/test_distributed_poisson_solvers.jl
+++ b/test/test_distributed_poisson_solvers.jl
@@ -1,13 +1,20 @@
-include("dependencies_for_runtests.jl")
-
 using MPI
+MPI.Init()
+
+# Seed the global random number generator
+# so that results are reproducible
+using Random
+Random.seed!(1234)
+
+include("dependencies_for_runtests.jl")
+include("dependencies_for_poisson_solvers.jl")
 
-# # Distributed model tests
+# # Distributed Poisson Solver tests
 #
 # These tests are meant to be run on 4 ranks. This script may be run
 # stand-alone (outside the test environment) via
 #
-# mpiexec -n 4 julia --project test_distributed_models.jl
+# mpiexec -n 4 julia --project test_distributed_poisson_solvers.jl
 #
 # provided that a few packages (like TimesDates.jl) are in your global environment.
 #
@@ -17,40 +24,37 @@ using MPI
 #
 # then later:
 # 
-# julia> include("test_distributed_models.jl")
+# julia> include("test_distributed_poisson_solvers.jl")
 #
 # When running the tests this way, uncomment the following line
 
-MPI.Init()
-
 # to initialize MPI.
 
-using Oceananigans.DistributedComputations: reconstruct_global_grid, Partition
-using Oceananigans.DistributedComputations: ZXYPermutation, ZYXPermutation
+using Oceananigans.DistributedComputations: reconstruct_global_grid, DistributedGrid, Partition, DistributedFourierTridiagonalPoissonSolver
+using Oceananigans.Models.NonhydrostaticModels: solve_for_pressure!
 
-@kernel function set_distributed_solver_input!(permuted_ϕ, ϕ, ::ZYXPermutation)
-    i, j, k = @index(Global, NTuple)
-    @inbounds permuted_ϕ[k, j, i] = ϕ[i, j, k]
-end
+function random_divergent_source_term(grid::DistributedGrid)
+    arch = architecture(grid)
+    default_bcs = FieldBoundaryConditions()
 
-@kernel function set_distributed_solver_input!(permuted_ϕ, ϕ, ::ZXYPermutation)
-    i, j, k = @index(Global, NTuple)
-    @inbounds permuted_ϕ[k, i, j] = ϕ[i, j, k]
-end
+    u_bcs = regularize_field_boundary_conditions(default_bcs, grid, :u)
+    v_bcs = regularize_field_boundary_conditions(default_bcs, grid, :v)
+    w_bcs = regularize_field_boundary_conditions(default_bcs, grid, :w)
 
-function random_divergent_source_term(grid)
-    # Generate right hand side from a random (divergent) velocity field.
-    Ru = XFaceField(grid)
-    Rv = YFaceField(grid)
-    Rw = ZFaceField(grid)
+    u_bcs = inject_halo_communication_boundary_conditions(u_bcs, arch.local_rank, arch.connectivity, topology(grid))
+    v_bcs = inject_halo_communication_boundary_conditions(v_bcs, arch.local_rank, arch.connectivity, topology(grid))
+    w_bcs = inject_halo_communication_boundary_conditions(w_bcs, arch.local_rank, arch.connectivity, topology(grid))
+
+    Ru = XFaceField(grid, boundary_conditions=u_bcs)
+    Rv = YFaceField(grid, boundary_conditions=v_bcs)
+    Rw = ZFaceField(grid, boundary_conditions=w_bcs)
     U = (u=Ru, v=Rv, w=Rw)
 
     Nx, Ny, Nz = size(grid)
-    set!(Ru, (x, y, z) -> rand())
-    set!(Rv, (x, y, z) -> rand())
-    set!(Rw, (x, y, z) -> rand())
+    set!(Ru, rand(size(Ru)...))
+    set!(Rv, rand(size(Rv)...))
+    set!(Rw, rand(size(Rw)...))
 
-    arch = architecture(grid)
     fill_halo_regions!(Ru)
     fill_halo_regions!(Rv)
     fill_halo_regions!(Rw)
@@ -59,52 +63,97 @@ function random_divergent_source_term(grid)
     ArrayType = array_type(arch)
     R = zeros(Nx, Ny, Nz) |> ArrayType
     launch!(arch, grid, :xyz, divergence!, grid, U.u.data, U.v.data, U.w.data, R)
-
-    return R
+    
+    return R, U
 end
 
-function divergence_free_poisson_solution_triply_periodic(grid_points, ranks)
-    arch = Distributed(CPU(), partition=Partition(ranks...))
-    local_grid = RectilinearGrid(arch, topology=(Periodic, Periodic, Periodic), size=grid_points, extent=(1, 2, 3))
-
-    bcs = FieldBoundaryConditions(local_grid, (Center, Center, Center))
-    bcs = inject_halo_communication_boundary_conditions(bcs, arch.local_rank, arch.connectivity, (Periodic, Periodic, Periodic))
+function divergence_free_poisson_solution(grid_points, ranks, topo, child_arch) # true when ∇²ϕ of the solved ϕ matches R
+    arch = Distributed(child_arch, partition=Partition(ranks...))
+    local_grid = RectilinearGrid(arch, topology=topo, size=grid_points, extent=(2π, 2π, 2π))
 
     # The test will solve for ϕ, then compare R to ∇²ϕ.
-    ϕ   = CenterField(local_grid, boundary_conditions=bcs)
-    ∇²ϕ = CenterField(local_grid, boundary_conditions=bcs)
-    R   = random_divergent_source_term(local_grid)
+    ϕ   = CenterField(local_grid)
+    ∇²ϕ = CenterField(local_grid)
+    R, U = random_divergent_source_term(local_grid)
 
     global_grid = reconstruct_global_grid(local_grid)
     solver = DistributedFFTBasedPoissonSolver(global_grid, local_grid)
+    # U is the velocity tuple whose divergence produced R (see random_divergent_source_term).
+    # Using Δt = 1, so the solved ϕ is expected to satisfy ∇²ϕ ≈ R (presumably the solve divides by Δt — confirm).
+    solve_for_pressure!(ϕ, solver, 1, U)
 
-    # Solve it
-    ϕc = first(solver.storage)
+    # "Recompute" ∇²ϕ
+    compute_∇²!(∇²ϕ, ϕ, arch, local_grid)
 
-    launch!(arch, local_grid, :xyz, set_distributed_solver_input!, ϕc, R, solver.input_permutation)
+    return Array(interior(∇²ϕ)) ≈ Array(R) # both sides converted to plain `Array`s before `≈`
+end
+
+function divergence_free_poisson_tridiagonal_solution(grid_points, ranks, stretched_direction, child_arch) # true when ∇²ϕ matches R
+    arch = Distributed(child_arch, partition=Partition(ranks...))
+    # The stretched direction gets a vector of grid_points[d]+1 faces on [0, 2π]; the other two get the (0, 2π) extent.
+    if stretched_direction == :x
+        x = collect(range(0, 2π, length = grid_points[1]+1)) # uniformly spaced, but passed as a vector
+        y = z = (0, 2π)
+    elseif stretched_direction == :y
+        y = collect(range(0, 2π, length = grid_points[2]+1))
+        x = z = (0, 2π)
+    elseif stretched_direction == :z
+        z = collect(range(0, 2π, length = grid_points[3]+1))
+        x = y = (0, 2π)
+    end
+    # NOTE(review): any other stretched_direction leaves x, y, z undefined, so the next line throws UndefVarError.
+    local_grid = RectilinearGrid(arch; topology=(Bounded, Bounded, Bounded), size=grid_points, x, y, z)
 
-    solve!(ϕ, solver)
+    # The test will solve for ϕ, then compare R to ∇²ϕ.
+    ϕ   = CenterField(local_grid)
+    ∇²ϕ = CenterField(local_grid)
+    R, U = random_divergent_source_term(local_grid)
+
+    global_grid = reconstruct_global_grid(local_grid)
+    solver = DistributedFourierTridiagonalPoissonSolver(global_grid, local_grid)
+    # U is the velocity tuple whose divergence produced R (see random_divergent_source_term).
+    # Using Δt = 1.
+    solve_for_pressure!(ϕ, solver, 1, U)
 
     # "Recompute" ∇²ϕ
     compute_∇²!(∇²ϕ, ϕ, arch, local_grid)
 
-    return R ≈ interior(∇²ϕ)
+    return Array(interior(∇²ϕ)) ≈ Array(R) # both sides converted to plain `Array`s before `≈`
 end
 
 @testset "Distributed FFT-based Poisson solver" begin
-    @info "  Testing 3D distributed FFT-based Poisson solver..."
-    @test divergence_free_poisson_solution_triply_periodic((44, 44, 8), (1, 4, 1))
-    @test divergence_free_poisson_solution_triply_periodic((44, 16, 8), (1, 4, 1))
-    @test divergence_free_poisson_solution_triply_periodic((16, 44, 8), (1, 4, 1))
-    @test divergence_free_poisson_solution_triply_periodic((44, 16, 8), (2, 2, 1))
-    @test divergence_free_poisson_solution_triply_periodic((16, 44, 8), (2, 2, 1))
-
-    @info "  Testing 2D distributed FFT-based Poisson solver..."
-    @test divergence_free_poisson_solution_triply_periodic((44, 16, 1), (1, 4, 1))
-    @test divergence_free_poisson_solution_triply_periodic((44, 16, 1), (4, 1, 1))
-    @test divergence_free_poisson_solution_triply_periodic((16, 44, 1), (1, 4, 1))
-    @test divergence_free_poisson_solution_triply_periodic((16, 44, 1), (4, 1, 1))
-
-    @test_throws ArgumentError divergence_free_poisson_solution_triply_periodic((16, 44, 1), (2, 2, 1))
-    @test_throws ArgumentError divergence_free_poisson_solution_triply_periodic((44, 16, 1), (2, 2, 1))
+    child_arch = test_child_arch()
+    # FFT-based solver: every (grid size, rank partition) case below runs for each topology.
+    for topology in ((Periodic, Periodic, Periodic),
+                     (Periodic, Periodic, Bounded),
+                     (Periodic, Bounded, Bounded),
+                     (Bounded, Bounded, Bounded))
+
+        @info "  Testing 3D distributed FFT-based Poisson solver with topology $topology..."
+        @test divergence_free_poisson_solution((44, 44, 8), (4, 1, 1), topology, child_arch)
+        @test divergence_free_poisson_solution((16, 44, 8), (4, 1, 1), topology, child_arch)
+        @test divergence_free_poisson_solution((44, 44, 8), (1, 4, 1), topology, child_arch)
+        @test divergence_free_poisson_solution((44, 16, 8), (1, 4, 1), topology, child_arch)
+        @test divergence_free_poisson_solution((16, 44, 8), (1, 4, 1), topology, child_arch)
+        @test divergence_free_poisson_solution((22, 44, 8), (2, 2, 1), topology, child_arch)
+        @test divergence_free_poisson_solution((44, 22, 8), (2, 2, 1), topology, child_arch)
+
+        @info "  Testing 2D distributed FFT-based Poisson solver with topology $topology..."
+        @test divergence_free_poisson_solution((44, 16, 1), (4, 1, 1), topology, child_arch)
+        @test divergence_free_poisson_solution((16, 44, 1), (4, 1, 1), topology, child_arch)
+    end
+
+    # Fourier-tridiagonal solver: (grid size, rank partition) cases, each listed once and run once per direction.
+    tridiagonal_cases = (((44, 44, 8), (1, 4, 1)),
+                         ((44,  4, 8), (1, 4, 1)),
+                         ((16, 44, 8), (1, 4, 1)),
+                         ((22,  8, 8), (2, 2, 1)),
+                         (( 8, 22, 8), (2, 2, 1)))
+
+    for stretched_direction in (:x, :y, :z)
+        @info "  Testing 3D distributed Fourier Tridiagonal Poisson solver stretched in $stretched_direction"
+        for (grid_points, ranks) in tridiagonal_cases
+            @test divergence_free_poisson_tridiagonal_solution(grid_points, ranks, stretched_direction, child_arch)
+        end
+    end
 end
diff --git a/test/test_distributed_transpose.jl b/test/test_distributed_transpose.jl
new file mode 100644
index 0000000000..5bc0fe6292
--- /dev/null
+++ b/test/test_distributed_transpose.jl
@@ -0,0 +1,57 @@
+using MPI
+
+MPI.Init()
+
+include("dependencies_for_runtests.jl")
+
+using Oceananigans.DistributedComputations: TransposableField,                
+                                            transpose_z_to_y!,                
+                                            transpose_y_to_z!,                
+                                            transpose_y_to_x!,                
+                                            transpose_x_to_y!
+
+# Round-trip test: returns true when a z → y → x → y → z transpose cycle leaves the data unchanged.
+function test_transpose(grid_points, ranks, topo, child_arch)
+    distributed_arch = Distributed(child_arch, partition=Partition(ranks...))
+    grid = RectilinearGrid(distributed_arch, topology=topo, size=grid_points, extent=(2π, 2π, 2π))
+
+    # Complex-valued reference field and the transposable field built from it
+    ϕ = Field((Center, Center, Center), grid, ComplexF64)
+    Φ = TransposableField(ϕ)
+
+    # Fill ϕ with random data, then copy it into Φ.zfield
+    initial_data = on_architecture(child_arch, rand(ComplexF64, size(ϕ)))
+    set!(ϕ, initial_data)
+    set!(Φ.zfield, ϕ)
+
+    # Complete a full transposition cycle, applying the four transposes in order
+    for transpose! in (transpose_z_to_y!, transpose_y_to_x!, transpose_x_to_y!, transpose_y_to_z!)
+        transpose!(Φ)
+    end
+
+    # Exact elementwise comparison of real and imaginary parts on plain `Array`s
+    reference  = Array(interior(ϕ))
+    round_trip = Array(interior(Φ.zfield))
+    same_real_part = all(real.(reference) .== real.(round_trip))
+    same_imag_part = all(imag.(reference) .== imag.(round_trip))
+    return same_real_part & same_imag_part
+end
+
+@testset "Distributed Transpose" begin
+    child_arch = test_child_arch()
+
+    # (grid size, rank partition) pairs, each run for every topology below
+    cases = (((44, 44, 8), (4, 1, 1)), ((16, 44, 8), (4, 1, 1)),
+             ((44, 44, 8), (1, 4, 1)), ((44, 16, 8), (1, 4, 1)), ((16, 44, 8), (1, 4, 1)),
+             ((44, 16, 8), (2, 2, 1)), ((16, 44, 8), (2, 2, 1)))
+    topologies = ((Periodic, Periodic, Periodic), (Periodic, Periodic, Bounded),
+                  (Periodic, Bounded, Bounded), (Bounded, Bounded, Bounded))
+
+    for topology in topologies
+        @info "  Testing 3D transpose with topology $topology..."
+        for (grid_points, ranks) in cases
+            @test test_transpose(grid_points, ranks, topology, child_arch)
+        end
+    end
+end
+
diff --git a/test/test_dynamics.jl b/test/test_dynamics.jl
index 99b03f6699..9097150946 100644
--- a/test/test_dynamics.jl
+++ b/test/test_dynamics.jl
@@ -88,7 +88,7 @@ end
 function test_immersed_diffusion(Nz, z, time_discretization)
     closure         = ScalarDiffusivity(time_discretization, κ = 1)
     underlying_grid = RectilinearGrid(size=Nz, z=z, topology=(Flat, Flat, Bounded))
-    grid            = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(() -> 0))
+    grid            = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(() -> 0); active_cells_map = true)
 
     Δz_min = minimum(underlying_grid.Δzᵃᵃᶜ)
     model_kwargs = (tracers=:c, buoyancy=nothing, velocities=PrescribedVelocityFields())
@@ -131,7 +131,7 @@ function test_3D_immersed_diffusion(Nz, z, time_discretization)
                   b b b b b b b b b]
 
     underlying_grid = RectilinearGrid(size=(9, 9, Nz), x=(0, 1), y=(0, 1), z=z, topology=(Periodic, Periodic, Bounded))
-    grid            = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bathymetry))
+    grid            = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bathymetry); active_cells_map = true)
     
     Δz_min = minimum(grid.underlying_grid.Δzᵃᵃᶜ)
     model_kwargs = (tracers=:c, buoyancy=nothing, velocities=PrescribedVelocityFields())
diff --git a/test/test_ensemble_hydrostatic_free_surface_models.jl b/test/test_ensemble_hydrostatic_free_surface_models.jl
index 17736968be..95ab1b701c 100644
--- a/test/test_ensemble_hydrostatic_free_surface_models.jl
+++ b/test/test_ensemble_hydrostatic_free_surface_models.jl
@@ -4,6 +4,41 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: ColumnEnsembleSize, Slic
 using Oceananigans.TurbulenceClosures: ConvectiveAdjustmentVerticalDiffusivity
 const CAVD = ConvectiveAdjustmentVerticalDiffusivity
 
+@testset "`HydrostaticFreeSurfaceModel` using a `SingleColumnGrid`" begin
+    # A (Flat, Flat, Bounded) column and a 1×1×Nz (Periodic, Periodic, Bounded) box get the
+    # same physics, initial state and time stepping; u, v and c at [1, 1, :] must then agree.
+    Nz, Hz = 3, 1
+    z_extent = (-1, 0)
+
+    single_column_grid = RectilinearGrid(; size=Nz, z=z_extent, topology=(Flat, Flat, Bounded), halo=Hz)
+    periodic_grid = RectilinearGrid(; size=(1, 1, Nz), x=(0, 1), y=(0, 1), z=z_extent,
+                                    topology=(Periodic, Periodic, Bounded), halo=(1, 1, Hz))
+
+    # Model settings shared by both grids
+    model_kwargs = (; tracers = :c,
+                      buoyancy = nothing,
+                      closure = CAVD(background_κz=1.0),
+                      coriolis = FPlane(f=0.2))
+
+    sic_model = HydrostaticFreeSurfaceModel(; grid = single_column_grid, model_kwargs...)
+    per_model = HydrostaticFreeSurfaceModel(; grid = periodic_grid,      model_kwargs...)
+
+    # Identical initial conditions (the tracer profile depends on z only)
+    set!(sic_model, c = z         -> exp(-z^2), u = 1, v = 1)
+    set!(per_model, c = (x, y, z) -> exp(-z^2), u = 1, v = 1)
+
+    simulation_kwargs = (; Δt = 0.01, stop_iteration = 100)
+    sic_simulation = Simulation(sic_model; simulation_kwargs...)
+    per_simulation = Simulation(per_model; simulation_kwargs...)
+    run!(sic_simulation)
+    run!(per_simulation)
+
+    @info "Testing Single column grid results..."
+    for column in (m -> m.velocities.u, m -> m.velocities.v, m -> m.tracers.c)
+        @test all(column(sic_model).data[1, 1, :] .≈ column(per_model).data[1, 1, :])
+    end
+end
+
 @testset "Ensembles of `HydrostaticFreeSurfaceModel` with different closures" begin
 
     Nz = 16
diff --git a/test/test_enzyme.jl b/test/test_enzyme.jl
index e9b9397d68..00ff0b90ba 100644
--- a/test/test_enzyme.jl
+++ b/test/test_enzyme.jl
@@ -1,25 +1,238 @@
-using Oceananigans
-using Enzyme
+include("dependencies_for_runtests.jl")
 
 # Required presently
-Enzyme.API.runtimeActivity!(true)
+Enzyme.API.looseTypeAnalysis!(true)
+Enzyme.API.maxtypeoffset!(2032)
 
-EnzymeRules.inactive_type(::Type{<:Oceananigans.Grids.AbstractGrid}) = true
+# OceananigansLogger doesn't work here -- not sure why
+Logging.global_logger(TestLogger())
 
 f(grid) = CenterField(grid)
+const maximum_diffusivity = 100
+
+"""
+    set_diffusivity!(model, diffusivity)
+
+Replace `model.closure` with a `VerticalScalarDiffusivity` whose `κ` is `diffusivity`,
+passed through `with_tracers` for the single tracer name `:c`. Returns `nothing`.
+"""
+function set_diffusivity!(model, diffusivity)
+    tracer_names = (:c,) # hard-coded; the original note points at tracernames(model.tracers)
+    scalar_diffusivity = VerticalScalarDiffusivity(; κ=diffusivity)
+    model.closure = with_tracers(tracer_names, scalar_diffusivity)
+    return nothing
+end
+
+function set_initial_condition!(model, amplitude) # set tracer c to a Gaussian blob scaled by `amplitude`
+    amplitude = Ref(amplitude)
+    # NOTE(review): the Ref wrapper looks deliberate for the Enzyme tests that differentiate this function — confirm before simplifying.
+    # This has a "width" of 0.1
+    cᵢ(x, y, z) = amplitude[] * exp(-z^2 / 0.02 - (x^2 + y^2) / 0.05)
+    set!(model, c=cᵢ)
+    # `model` is mutated in place, so there is no value to return.
+    return nothing
+end
+
+function stable_diffusion!(model, amplitude, diffusivity) # returns Σ c² over the grid after ten time steps
+    set_diffusivity!(model, diffusivity)
+    set_initial_condition!(model, amplitude)
+    # The step size below depends only on the grid and maximum_diffusivity, not on `diffusivity`.
+    # Do time-stepping
+    Nx, Ny, Nz = size(model.grid)
+    κ_max = maximum_diffusivity
+    Δz = 1 / Nz # assumes a vertical extent of 1 (true for the grid built in the test below)
+    Δt = 1e-1 * Δz^2 / κ_max
+    # Reset the clock so repeated calls on the same model start from time 0, iteration 0.
+    model.clock.time = 0
+    model.clock.iteration = 0
+    # Ten steps, each passing euler=true to time_step!.
+    for _ = 1:10
+        time_step!(model, Δt; euler=true)
+    end
+
+    # Compute scalar metric
+    c = model.tracers.c
+
+    # Hard way (for enzyme - the sum function sometimes errors with AD)
+    # c² = c^2
+    # sum_c² = sum(c²)
+
+    # Another way to compute it
+    sum_c² = 0.0
+    for k = 1:Nz, j = 1:Ny,  i = 1:Nx
+        sum_c² += c[i, j, k]^2
+    end
+
+    # Need the ::Float64 for type inference with automatic differentiation
+    return sum_c²::Float64
+end
 
 @testset "Enzyme Unit Tests" begin
-    arch=CPU()
-    FT=Float64
+    arch = CPU()
+    FT = Float64
 
     N = 100
     topo = (Periodic, Flat, Flat)
     grid = RectilinearGrid(arch, FT, topology=topo, size=N, halo=2, x=(-1, 1), y=(-1, 1), z=(-1, 1))
     fwd, rev = Enzyme.autodiff_thunk(ReverseSplitWithPrimal, Const{typeof(f)}, Duplicated, typeof(Const(grid)))
+    tape, primal, shadowp = fwd(Const(f), Const(grid)) # only the forward thunk is run; `rev` is never called here
 
-    tape, primal, shadow = fwd(Const(f), Const(grid) )
+    @show tape primal shadowp
 
-    @show tape, primal, shadow
+    shadow = if shadowp isa Base.RefValue # the shadow may come back wrapped in a RefValue; unwrap it before size()
+        shadowp[]
+    else
+        shadowp
+    end
 
     @test size(primal) == size(shadow)
 end
+
+function set_initial_condition_via_launch!(model_tracer, amplitude)
+    # Fill model_tracer with the constant `amplitude` by launching the broadcast kernel directly, without calling set!
+    amplitude = Ref(amplitude)
+    cᵢ(x, y, z) = amplitude[]
+    # Lazy broadcast of `identity` over a FunctionField that evaluates cᵢ on the tracer's grid
+    temp = Base.broadcasted(Base.identity, FunctionField((Center, Center, Center), cᵢ, model_tracer.grid))
+    # Convert to Broadcasted{Nothing} (presumably the form _broadcast_kernel! expects — TODO confirm)
+    temp = convert(Base.Broadcast.Broadcasted{Nothing}, temp)
+    grid = model_tracer.grid
+    arch = architecture(model_tracer)
+    # Kernel extent is the tracer's size; offsets come from offset_index mapped over its indices
+    param = Oceananigans.Utils.KernelParameters(size(model_tracer), map(Oceananigans.Fields.offset_index, model_tracer.indices))
+    Oceananigans.Utils.launch!(arch, grid, param, Oceananigans.Fields._broadcast_kernel!, model_tracer, temp)
+
+    return nothing
+end
+
+@testset "Enzyme + Oceananigans Initialization Broadcast Kernel" begin
+    # Smoke test: there are no @test assertions here; it passes if none of the three autodiff calls throws.
+    Nx = Ny = 64
+    Nz = 8
+
+    x = y = (-π, π)
+    z = (-0.5, 0.5)
+    topology = (Periodic, Periodic, Bounded)
+
+    grid = RectilinearGrid(size=(Nx, Ny, Nz); x, y, z, topology)
+    model = HydrostaticFreeSurfaceModel(; grid, tracers=:c)
+    model_tracer = model.tracers.c
+    # Same broadcast setup as set_initial_condition_via_launch!, rebuilt here so launch! can be differentiated on its own
+    amplitude = 1.0
+    amplitude = Ref(amplitude)
+    cᵢ(x, y, z) = amplitude[]
+    temp = Base.broadcasted(Base.identity, FunctionField((Center, Center, Center), cᵢ, model_tracer.grid))
+
+    temp = convert(Base.Broadcast.Broadcasted{Nothing}, temp)
+    grid = model_tracer.grid
+    arch = architecture(model_tracer)
+    # Guard: the autodiff calls below run only when the tracer's architecture is CPU()
+    if arch == CPU()
+        param = Oceananigans.Utils.KernelParameters(size(model_tracer),
+                                                    map(Oceananigans.Fields.offset_index, model_tracer.indices))
+        dmodel_tracer = Enzyme.make_zero(model_tracer)
+
+        # Test the individual kernel launch
+        autodiff(Enzyme.set_runtime_activity(Enzyme.Reverse),
+                 Oceananigans.Utils.launch!,
+                 Const(arch),
+                 Const(grid),
+                 Const(param),
+                 Const(Oceananigans.Fields._broadcast_kernel!),
+                 Duplicated(model_tracer, dmodel_tracer),
+                 Const(temp))
+
+        # Test out differentiation of the broadcast infrastructure
+        autodiff(Enzyme.set_runtime_activity(Enzyme.Reverse),
+                 set_initial_condition_via_launch!,
+                 Duplicated(model_tracer, dmodel_tracer),
+                 Active(1.0))
+
+        # Test differentiation of the high-level set interface
+        dmodel = Enzyme.make_zero(model)
+        autodiff(Enzyme.set_runtime_activity(Enzyme.Reverse),
+                 set_initial_condition!,
+                 Duplicated(model, dmodel),
+                 Active(1.0))
+    end
+end
+
+@testset "Enzyme on advection and diffusion" begin
+    Nx = Ny = 64
+    Nz = 8
+
+    Lx = Ly = L = 2π
+    Lz = 1
+
+    x = y = (-L/2, L/2)
+    z = (-Lz/2, Lz/2)
+    topology = (Periodic, Periodic, Bounded)
+
+    grid = RectilinearGrid(size=(Nx, Ny, Nz); x, y, z, topology)
+    diffusion = VerticalScalarDiffusivity(κ=0.1)
+    # Horizontal velocities are set once from u₀ and v₀ and handed to PrescribedVelocityFields below
+    u = XFaceField(grid)
+    v = YFaceField(grid)
+
+    U = 1
+    u₀(x, y, z) = - U * cos(x + L/8) * sin(y) * (z + L/2)
+    v₀(x, y, z) = + U * sin(x + L/8) * cos(y) * (z + L/2)
+
+    set!(u, u₀)
+    set!(v, v₀)
+    fill_halo_regions!(u)
+    fill_halo_regions!(v)
+    # NOTE(review): tracer_flux, parameters, top_c_bc and c_bcs are built but never passed to the model (see TODO item 2)
+    @inline function tracer_flux(x, y, t, c, p)
+        c₀ = p.surface_tracer_concentration
+        u★ = p.piston_velocity
+        return - u★ * (c₀ - c)
+    end
+
+    parameters = (surface_tracer_concentration = 1,
+                  piston_velocity = 0.1)
+
+    top_c_bc = FluxBoundaryCondition(tracer_flux, field_dependencies=:c; parameters)
+    c_bcs = FieldBoundaryConditions(top=top_c_bc)
+
+    # TODO:
+    # 1. Make the velocity fields evolve
+    # 2. Add surface fluxes
+    # 3. Do a problem where we invert for the tracer fluxes (maybe with CATKE)
+
+    model = HydrostaticFreeSurfaceModel(; grid,
+                                        tracer_advection = WENO(),
+                                        tracers = :c,
+                                        velocities = PrescribedVelocityFields(; u, v),
+                                        closure = diffusion)
+    # Reference value: centered finite difference of Σc² between κ = 0.9 and κ = 1.1 (i.e. about κ = 1)
+    # Compute derivative by hand
+    κ₁, κ₂ = 0.9, 1.1
+    c²₁ = stable_diffusion!(model, 1, κ₁)
+    c²₂ = stable_diffusion!(model, 1, κ₂)
+    dc²_dκ_fd = (c²₂ - c²₁) / (κ₂ - κ₁)
+
+    # Now for real
+    amplitude = 1.0
+    κ = 1.0
+    dmodel = Enzyme.make_zero(model)
+    set_diffusivity!(dmodel, 0)
+    # Reverse-mode derivative with respect to κ only; the amplitude is held Const
+    dc²_dκ = autodiff(Enzyme.set_runtime_activity(Enzyme.Reverse),
+                      stable_diffusion!,
+                      Duplicated(model, dmodel),
+                      Const(amplitude),
+                      Active(κ))
+
+    @info """ \n
+        Enzyme computed $dc²_dκ
+        Finite differences computed $dc²_dκ_fd
+    """
+    # dc²_dκ[1][3] is presumably the derivative for the third argument (κ, the only Active one); 1% relative tolerance
+    tol = 0.01
+    rel_error = abs(dc²_dκ[1][3] - dc²_dκ_fd) / abs(dc²_dκ_fd)
+    @show dc²_dκ, dc²_dκ_fd
+    @test rel_error < tol
+    
+end
+
diff --git a/test/test_forcings.jl b/test/test_forcings.jl
index 51c9b30a40..e4e4e48294 100644
--- a/test/test_forcings.jl
+++ b/test/test_forcings.jl
@@ -2,6 +2,7 @@ include("dependencies_for_runtests.jl")
 
 using Oceananigans.BoundaryConditions: ImpenetrableBoundaryCondition
 using Oceananigans.Fields: Field
+using Oceananigans.Forcings: MultipleForcings
 
 """ Take one time step with three forcing arrays on u, v, w. """
 function time_step_with_forcing_array(arch)
@@ -16,7 +17,7 @@ function time_step_with_forcing_array(arch)
     set!(Fw, (x, y, z) -> 1)
 
     model = NonhydrostaticModel(; grid, forcing=(u=Fu, v=Fv, w=Fw))
-    time_step!(model, 1, euler=true)
+    time_step!(model, 1)
 
     return true
 end
@@ -28,8 +29,8 @@ function time_step_with_forcing_functions(arch)
     @inline Fw(x, y, z, t) = 1.0
 
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
-    model = NonhydrostaticModel(grid=grid, forcing=(u=Fu, v=Fv, w=Fw))
-    time_step!(model, 1, euler=true)
+    model = NonhydrostaticModel(; grid, forcing=(u=Fu, v=Fv, w=Fw))
+    time_step!(model, 1)
 
     return true
 end
@@ -40,12 +41,10 @@ end
 
 """ Take one time step with a DiscreteForcing function. """
 function time_step_with_discrete_forcing(arch)
-
     Fu = Forcing(Fu_discrete_func, discrete_form=true)
-
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
-    model = NonhydrostaticModel(grid=grid, forcing=(u=Fu,))
-    time_step!(model, 1, euler=true)
+    model = NonhydrostaticModel(; grid, forcing=(; u=Fu))
+    time_step!(model, 1)
 
     return true
 end
@@ -53,25 +52,22 @@ end
 """ Take one time step with ParameterizedForcing forcing functions. """
 function time_step_with_parameterized_discrete_forcing(arch)
 
-    Fv = Forcing(Fv_discrete_func, parameters=(τ=60,), discrete_form=true)
-    Fw = Forcing(Fw_discrete_func, parameters=(τ=60,), discrete_form=true)
+    Fv = Forcing(Fv_discrete_func, parameters=(; τ=60), discrete_form=true)
+    Fw = Forcing(Fw_discrete_func, parameters=(; τ=60), discrete_form=true)
 
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
-    model = NonhydrostaticModel(grid=grid, forcing=(v=Fv, w=Fw))
-    time_step!(model, 1, euler=true)
+    model = NonhydrostaticModel(; grid, forcing=(v=Fv, w=Fw))
+    time_step!(model, 1)
 
     return true
 end
 
 """ Take one time step with a Forcing forcing function with parameters. """
 function time_step_with_parameterized_continuous_forcing(arch)
-
-    u_forcing = Forcing((x, y, z, t, ω) -> sin(ω * x), parameters=π)
-
+    Fu = Forcing((x, y, z, t, ω) -> sin(ω * x), parameters=π)
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
-    model = NonhydrostaticModel(grid=grid, forcing=(u=u_forcing,))
-    time_step!(model, 1, euler=true)
-
+    model = NonhydrostaticModel(; grid, forcing=(; u=Fu))
+    time_step!(model, 1)
     return true
 end
 
@@ -82,9 +78,11 @@ function time_step_with_single_field_dependent_forcing(arch, fld)
 
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
     A = Field{Center, Center, Center}(grid)
-    model = NonhydrostaticModel(grid=grid, forcing=forcing,
-                                buoyancy=SeawaterBuoyancy(), tracers=(:T, :S), auxiliary_fields=(; A))
-    time_step!(model, 1, euler=true)
+    model = NonhydrostaticModel(; grid, forcing,
+                                buoyancy = SeawaterBuoyancy(),
+                                tracers = (:T, :S),
+                                auxiliary_fields = (; A))
+    time_step!(model, 1)
 
     return true
 end
@@ -92,13 +90,16 @@ end
 """ Take one time step with a Forcing forcing function with parameters. """
 function time_step_with_multiple_field_dependent_forcing(arch)
 
-    u_forcing = Forcing((x, y, z, t, v, w, T, A) -> sin(v) * exp(w) * T *A, field_dependencies=(:v, :w, :T, :A))
+    Fu = Forcing((x, y, z, t, v, w, T, A) -> sin(v)*exp(w)*T*A, field_dependencies=(:v, :w, :T, :A))
 
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
     A = Field{Center, Center, Center}(grid)
-    model = NonhydrostaticModel(grid=grid, forcing=(u=u_forcing,),
-                                buoyancy=SeawaterBuoyancy(), tracers=(:T, :S), auxiliary_fields = (; A))
-    time_step!(model, 1, euler=true)
+    model = NonhydrostaticModel(; grid,
+                                forcing = (; u=Fu),
+                                buoyancy = SeawaterBuoyancy(),
+                                tracers = (:T, :S),
+                                auxiliary_fields = (; A))
+    time_step!(model, 1)
 
     return true
 end
@@ -106,12 +107,35 @@ end
 
 """ Take one time step with a Forcing forcing function with parameters. """
 function time_step_with_parameterized_field_dependent_forcing(arch)
+    Fu = Forcing((x, y, z, t, u, p) -> sin(p.ω * x) * u, parameters=(ω=π,), field_dependencies=:u)
+    grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
+    model = NonhydrostaticModel(; grid, forcing=(; u=Fu))
+    time_step!(model, 1)
+    return true
+end
 
-    u_forcing = Forcing((x, y, z, t, u, p) -> sin(p.ω * x) * u, parameters=(ω=π,), field_dependencies=:u)
+""" Take one time step with a FieldTimeSeries as the u forcing, then check that a partly in-memory series advances. """
+function time_step_with_field_time_series_forcing(arch)
 
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
-    model = NonhydrostaticModel(grid=grid, forcing=(u=u_forcing,))
-    time_step!(model, 1, euler=true)
+    # Part 1: series on times 0:1:3, each snapshot set to sin(πx) scaled by its time
+    u_forcing = FieldTimeSeries{Face, Center, Center}(grid, 0:1:3)
+
+    for (t, time) in enumerate(u_forcing.times)
+        set!(u_forcing[t], (x, y, z) -> sin(π * x) * time)
+    end
+
+    model = NonhydrostaticModel(; grid, forcing=(; u=u_forcing))
+    time_step!(model, 1)
+
+    # Make sure the field time series updates correctly
+    u_forcing = FieldTimeSeries{Face, Center, Center}(grid, 0:1:4; backend = InMemory(2))
+    # NOTE(review): this second series is never filled with set!, so it forces with its initial data
+    model = NonhydrostaticModel(; grid, forcing=(; u=u_forcing))
+    time_step!(model, 2)
+    time_step!(model, 2)
+    # After two steps of Δt = 2 the test expects backend.start == 4 (presumably the first in-memory index — confirm)
+    @test u_forcing.backend.start == 4
 
     return true
 end
@@ -127,8 +151,8 @@ function relaxed_time_stepping(arch)
                                       target = π)
 
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
-    model = NonhydrostaticModel(grid=grid, forcing=(u=x_relax, v=y_relax, w=z_relax))
-    time_step!(model, 1, euler=true)
+    model = NonhydrostaticModel(; grid, forcing=(u=x_relax, v=y_relax, w=z_relax))
+    time_step!(model, 1)
 
     return true
 end
@@ -147,6 +171,7 @@ function advective_and_multiple_forcing(arch)
     one_forcing(x, y, z, t) = 1
 
     model = NonhydrostaticModel(; grid,
+                                timestepper = :QuasiAdamsBashforth2,
                                 tracers = (:a, :b, :c),
                                 forcing = (a = constant_slip,
                                            b = (zero_forcing, velocity_field_slip),
@@ -170,6 +195,50 @@ function advective_and_multiple_forcing(arch)
     return a_changed & b_changed & c_correct
 end
 
+"""
+Take one time step with two `Relaxation` forcings on each velocity component, passed as a
+plain tuple (u), as `MultipleForcings` arguments (v), and as a tuple to `MultipleForcings` (w).
+"""
+function two_forcings(arch)
+    grid = RectilinearGrid(arch, size=(4, 5, 6), extent=(1, 1, 1), halo=(4, 4, 4))
+
+    relaxations = (Relaxation(rate=1), Relaxation(rate=2))
+    forcing = (; u = relaxations,
+                 v = MultipleForcings(relaxations...),
+                 w = MultipleForcings(relaxations))
+    model = NonhydrostaticModel(; grid, forcing)
+    time_step!(model, 1)
+    return true
+end
+
+"""
+Take one time step with the same seven `(x, y, z, t)` forcing functions on each velocity
+component: as a plain tuple (u) and through both `MultipleForcings` call forms (v, w).
+"""
+function seven_forcings(arch)
+    grid = RectilinearGrid(arch, size=(4, 5, 6), extent=(1, 1, 1), halo=(4, 4, 4))
+
+    weird_forcing(x, y, z, t) = x * y + z
+    wonky_forcing(x, y, z, t) = z / (x - y)
+    strange_forcing(x, y, z, t) = z - t
+    bizarre_forcing(x, y, z, t) = y + x
+    peculiar_forcing(x, y, z, t) = 2t / z
+    eccentric_forcing(x, y, z, t) = x + y + z + t
+    unconventional_forcing(x, y, z, t) = 10x * y
+
+    # One `Forcing` per function, in the order defined above
+    forcing_functions = (weird_forcing, wonky_forcing, strange_forcing, bizarre_forcing,
+                         peculiar_forcing, eccentric_forcing, unconventional_forcing)
+    Ft = map(Forcing, forcing_functions)
+
+    forcing = (u=Ft, v=MultipleForcings(Ft...), w=MultipleForcings(Ft))
+    model = NonhydrostaticModel(; grid, forcing)
+
+    time_step!(model, 1)
+
+    return true
+end
+
 @testset "Forcings" begin
     @info "Testing forcings..."
 
@@ -210,6 +279,13 @@ end
             @testset "Advective and multiple forcing [$A]" begin
                 @info "      Testing advective and multiple forcing [$A]..."
                 @test advective_and_multiple_forcing(arch)
+                @test two_forcings(arch)
+                @test seven_forcings(arch)
+            end
+
+            @testset "FieldTimeSeries forcing on [$A]" begin
+                @info "      Testing FieldTimeSeries forcing [$A]..."
+                @test time_step_with_field_time_series_forcing(arch)
             end
         end
     end
diff --git a/test/test_grids.jl b/test/test_grids.jl
index 8e008e143b..4ed536c732 100644
--- a/test/test_grids.jl
+++ b/test/test_grids.jl
@@ -918,6 +918,48 @@ end
 
         @test grid isa LatitudeLongitudeGrid
     end
+
+    @testset "Single column grids" begin
+        @info "  Testing single column grid construction..."
+
+        # Expected behavior on (Flat, Flat, Bounded) grids: the horizontal nodes are `nothing`
+        # unless a scalar location is supplied, in which case the first node equals that
+        # location and is an `FT`.
+        for arch in archs, FT in float_types
+            ccc = (Center(), Center(), Center())
+            column = (size=4, z=(-1, 0), topology=(Flat, Flat, Bounded))
+
+            # Rectilinear column without horizontal locations
+            grid = RectilinearGrid(arch, FT; column...)
+            @test isnothing(xnodes(grid, ccc...))
+            @test isnothing(ynodes(grid, ccc...))
+
+            # Rectilinear column located at (x₀, y₀)
+            x₀, y₀ = 1, π
+            grid = RectilinearGrid(arch, FT; x=x₀, y=y₀, column...)
+            x = xnodes(grid, ccc...)
+            y = ynodes(grid, ccc...)
+            @test x[1] isa FT
+            @test y[1] isa FT
+            @test x[1] == x₀
+            @test y[1] == convert(FT, y₀)
+
+            # Latitude-longitude column without horizontal locations
+            grid = LatitudeLongitudeGrid(arch, FT; column...)
+            @test isnothing(λnodes(grid, ccc...))
+            @test isnothing(φnodes(grid, ccc...))
+
+            # Latitude-longitude column located at (λ₀, φ₀)
+            λ₀, φ₀ = 45, 10.1
+            grid = LatitudeLongitudeGrid(arch, FT; latitude=φ₀, longitude=λ₀, column...)
+            λ = λnodes(grid, ccc...)
+            φ = φnodes(grid, ccc...)
+            @test λ[1] isa FT
+            @test φ[1] isa FT
+            @test λ[1] == λ₀
+            @test φ[1] == convert(FT, φ₀)
+        end
+    end
     
     @testset "Conformal cubed sphere face grid" begin
         @info "  Testing OrthogonalSphericalShellGrid grid..."
diff --git a/test/test_hydrostatic_free_surface_models.jl b/test/test_hydrostatic_free_surface_models.jl
index c2c48d4133..d303a3d186 100644
--- a/test/test_hydrostatic_free_surface_models.jl
+++ b/test/test_hydrostatic_free_surface_models.jl
@@ -14,16 +14,12 @@ function time_step_hydrostatic_model_works(grid;
                                            closure = nothing,
                                            velocities = nothing)
 
-    tracers = [:T, :S]
+    tracers = [:b]
+    buoyancy = BuoyancyTracer()
     closure isa CATKEVerticalDiffusivity && push!(tracers, :e)
 
-    model = HydrostaticFreeSurfaceModel(grid = grid,
-                                        momentum_advection = momentum_advection,
-                                        free_surface = free_surface,
-                                        coriolis = coriolis,
-                                        tracers = tracers,
-                                        velocities = velocities,
-                                        closure = closure)
+    model = HydrostaticFreeSurfaceModel(; grid, coriolis, tracers, velocities, buoyancy,
+                                        momentum_advection, free_surface, closure)
 
     simulation = Simulation(model, Δt=1.0, stop_iteration=1)
 
@@ -73,8 +69,8 @@ topos_3d = ((Periodic, Periodic, Bounded),
     @testset "$topo_1d model construction" begin
         @info "  Testing $topo_1d model construction..."
         for arch in archs, FT in [Float64] #float_types
-            grid = RectilinearGrid(arch, FT, topology=topo_1d, size=(1), extent=(1))
-            model = HydrostaticFreeSurfaceModel(grid=grid)
+            grid = RectilinearGrid(arch, FT, topology=topo_1d, size=1, extent=1)
+            model = HydrostaticFreeSurfaceModel(; grid)
             @test model isa HydrostaticFreeSurfaceModel
 
             # SingleColumnGrid tests
@@ -89,7 +85,7 @@ topos_3d = ((Periodic, Periodic, Bounded),
             @info "  Testing $topo model construction..."
             for arch in archs, FT in float_types
                 grid = RectilinearGrid(arch, FT, topology=topo, size=(1, 1), extent=(1, 2))
-                model = HydrostaticFreeSurfaceModel(grid=grid)
+                model = HydrostaticFreeSurfaceModel(; grid)
                 @test model isa HydrostaticFreeSurfaceModel
                 @test :η ∈ keys(fields(model)) # contrary to the SingleColumnGrid case
             end
@@ -101,7 +97,7 @@ topos_3d = ((Periodic, Periodic, Bounded),
             @info "  Testing $topo model construction..."
             for arch in archs, FT in float_types
                 grid = RectilinearGrid(arch, FT, topology=topo, size=(1, 1, 1), extent=(1, 2, 3))
-                model = HydrostaticFreeSurfaceModel(grid=grid)
+                model = HydrostaticFreeSurfaceModel(; grid)
                 @test model isa HydrostaticFreeSurfaceModel
             end
         end
diff --git a/test/test_immersed_advection.jl b/test/test_immersed_advection.jl
index fa948add2b..aaae16a899 100644
--- a/test/test_immersed_advection.jl
+++ b/test/test_immersed_advection.jl
@@ -6,14 +6,11 @@ using Oceananigans.Advection:
         _symmetric_interpolate_xᶜᵃᵃ,
         _symmetric_interpolate_yᵃᶠᵃ,
         _symmetric_interpolate_yᵃᶜᵃ,
-        _left_biased_interpolate_xᶜᵃᵃ, 
-        _left_biased_interpolate_xᶠᵃᵃ, 
-        _right_biased_interpolate_xᶜᵃᵃ,
-        _right_biased_interpolate_xᶠᵃᵃ,
-        _left_biased_interpolate_yᵃᶜᵃ, 
-        _left_biased_interpolate_yᵃᶠᵃ, 
-        _right_biased_interpolate_yᵃᶜᵃ,
-        _right_biased_interpolate_yᵃᶠᵃ
+        _biased_interpolate_xᶜᵃᵃ, 
+        _biased_interpolate_xᶠᵃᵃ, 
+        _biased_interpolate_yᵃᶜᵃ, 
+        _biased_interpolate_yᵃᶠᵃ,
+        TracerAdvection
 
 advection_schemes = [Centered, UpwindBiased, WENO]
 
@@ -26,10 +23,10 @@ function run_tracer_interpolation_test(c, ibg, scheme)
         if typeof(scheme) <: Centered
             @test CUDA.@allowscalar  _symmetric_interpolate_xᶠᵃᵃ(i+1, j, 1, ibg, scheme, c) ≈ 1.0
         else
-            @test CUDA.@allowscalar  _left_biased_interpolate_xᶠᵃᵃ(i+1, j, 1, ibg, scheme, c) ≈ 1.0
-            @test CUDA.@allowscalar _right_biased_interpolate_xᶠᵃᵃ(i+1, j, 1, ibg, scheme, c) ≈ 1.0
-            @test CUDA.@allowscalar  _left_biased_interpolate_yᵃᶠᵃ(i, j+1, 1, ibg, scheme, c) ≈ 1.0
-            @test CUDA.@allowscalar _right_biased_interpolate_yᵃᶠᵃ(i, j+1, 1, ibg, scheme, c) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_xᶠᵃᵃ(i+1, j, 1, ibg, scheme, true,  c) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_xᶠᵃᵃ(i+1, j, 1, ibg, scheme, false, c) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_yᵃᶠᵃ(i, j+1, 1, ibg, scheme, true,  c) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_yᵃᶠᵃ(i, j+1, 1, ibg, scheme, false, c) ≈ 1.0
         end
     end
 end
@@ -79,15 +76,15 @@ function run_momentum_interpolation_test(u, v, ibg, scheme)
             @test CUDA.@allowscalar  _symmetric_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, u) ≈ 1.0
             @test CUDA.@allowscalar  _symmetric_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, v) ≈ 1.0
         else
-            @test CUDA.@allowscalar  _left_biased_interpolate_xᶜᵃᵃ(i+1, j, 1, ibg, scheme, u) ≈ 1.0
-            @test CUDA.@allowscalar _right_biased_interpolate_xᶜᵃᵃ(i+1, j, 1, ibg, scheme, u) ≈ 1.0
-            @test CUDA.@allowscalar  _left_biased_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, u) ≈ 1.0
-            @test CUDA.@allowscalar _right_biased_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, u) ≈ 1.0
-
-            @test CUDA.@allowscalar  _left_biased_interpolate_xᶜᵃᵃ(i+1, j, 1, ibg, scheme, v) ≈ 1.0
-            @test CUDA.@allowscalar _right_biased_interpolate_xᶜᵃᵃ(i+1, j, 1, ibg, scheme, v) ≈ 1.0
-            @test CUDA.@allowscalar  _left_biased_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, v) ≈ 1.0
-            @test CUDA.@allowscalar _right_biased_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, v) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_xᶜᵃᵃ(i+1, j, 1, ibg, scheme, true,  u) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_xᶜᵃᵃ(i+1, j, 1, ibg, scheme, false, u) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, true,  u) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, false, u) ≈ 1.0
+
+            @test CUDA.@allowscalar _biased_interpolate_xᶜᵃᵃ(i+1, j, 1, ibg, scheme, true,  v) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_xᶜᵃᵃ(i+1, j, 1, ibg, scheme, false, v) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, true,  v) ≈ 1.0
+            @test CUDA.@allowscalar _biased_interpolate_yᵃᶜᵃ(i, j+1, 1, ibg, scheme, false, v) ≈ 1.0
         end
     end
 
@@ -128,6 +125,15 @@ for arch in archs
                 run_tracer_conservation_test(g, scheme)
             end
         end
+
+        for adv in advection_schemes, buffer in [1, 2, 3, 4, 5]
+            directional_scheme = adv(order = advective_order(buffer, adv))
+            scheme = TracerAdvection(directional_scheme, directional_scheme, directional_scheme)
+            for g in [grid, ibg]
+                @info "  Testing immersed tracer conservation [$(typeof(arch)), $(summary(scheme)), $(typeof(g).name.wrapper)]"
+                run_tracer_conservation_test(g, scheme)
+            end
+        end
     end
 
     @testset "Immersed momentum reconstruction" begin
diff --git a/test/test_immersed_boundary_grid.jl b/test/test_immersed_boundary_grid.jl
new file mode 100644
index 0000000000..5be526b3be
--- /dev/null
+++ b/test/test_immersed_boundary_grid.jl
@@ -0,0 +1,16 @@
+include("dependencies_for_runtests.jl")
+
+grid = RectilinearGrid(; size=(2, 2, 2), extent = (1, 1, 1))
+
+@testset "Testing Immersed Boundaries" begin
+    @info "Testing the immersed boundary construction..."
+
+    bottom(x, y) = -1 + 0.5 * exp(-x^2 - y^2)
+    ibg = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom))
+
+    # Smoke test so this testset asserts something: construction must succeed.
+    # TODO: unit test that the bottom ends up at the right position.
+    @test ibg isa ImmersedBoundaryGrid
+
+    @info "Testing stably stratified initial conditions..." # TODO: placeholder, nothing is tested yet
+end
diff --git a/test/test_jld2_output_writer.jl b/test/test_jld2_output_writer.jl
index 535258586e..46e512bdcd 100644
--- a/test/test_jld2_output_writer.jl
+++ b/test/test_jld2_output_writer.jl
@@ -98,7 +98,7 @@ function test_jld2_time_file_splitting(arch)
     end
     ow = JLD2OutputWriter(model, (; u=model.velocities.u);
                           dir = ".",
-                          filename = "test.jld2",
+                          filename = "test",
                           schedule = IterationInterval(1),
                           init = fake_bc_init,
                           including = [:grid],
@@ -214,7 +214,7 @@ for arch in archs
         simulation.output_writers[:velocities] = JLD2OutputWriter(model, vanilla_outputs,
                                                                   schedule = IterationInterval(1),
                                                                   dir = ".",
-                                                                  filename = "vanilla_jld2_test.jld2",
+                                                                  filename = "vanilla_jld2_test",
                                                                   indices = (:, :, :),
                                                                   with_halos = false,
                                                                   overwrite_existing = true)
@@ -224,7 +224,7 @@ for arch in archs
                                                               indices = (1:2, 1:4, :),
                                                               with_halos = false,
                                                               dir = ".",
-                                                              filename = "sliced_jld2_test.jld2",
+                                                              filename = "sliced_jld2_test",
                                                               overwrite_existing = true)
 
         func_outputs = (u = model -> u, v = model -> v, w = model -> w)
@@ -234,7 +234,7 @@ for arch in archs
                                                                     indices = (1:2, 1:4, :),
                                                                     with_halos = false,
                                                                     dir = ".",
-                                                                    filename = "sliced_funcs_jld2_test.jld2",
+                                                                    filename = "sliced_funcs_jld2_test",
                                                                     overwrite_existing = true)
 
 
@@ -243,7 +243,7 @@ for arch in archs
                                                                           indices = (1:2, 1:4, :),
                                                                           with_halos = false,
                                                                           dir = ".",
-                                                                          filename = "sliced_func_fields_jld2_test.jld2",
+                                                                          filename = "sliced_func_fields_jld2_test",
                                                                           overwrite_existing = true)
 
 
diff --git a/test/test_lagrangian_particle_tracking.jl b/test/test_lagrangian_particle_tracking.jl
index 7eaa8410f7..fb60c23beb 100644
--- a/test/test_lagrangian_particle_tracking.jl
+++ b/test/test_lagrangian_particle_tracking.jl
@@ -15,8 +15,14 @@ struct TestParticle{T}
 end
 
 function particle_tracking_simulation(; grid, particles, timestepper=:RungeKutta3, velocities=nothing)
-    model = NonhydrostaticModel(; grid, timestepper, velocities, particles)
-    set!(model, u=1, v=1)
+    if grid isa RectilinearGrid
+        model = NonhydrostaticModel(; grid, timestepper, velocities, particles)
+        set!(model, u=1, v=1)
+    else
+        set!(velocities.u, 1)
+        set!(velocities.v, 1)
+        model = HydrostaticFreeSurfaceModel(; grid, velocities=PrescribedVelocityFields(; velocities...), particles)
+    end
     sim = Simulation(model, Δt=1e-2, stop_iteration=1)
 
     jld2_filepath = "test_particles.jld2"
@@ -34,7 +40,7 @@ function particle_tracking_simulation(; grid, particles, timestepper=:RungeKutta
     return sim, jld2_filepath, nc_filepath
 end
 
-function run_simple_particle_tracking_tests(grid, timestepper)
+function run_simple_particle_tracking_tests(grid, timestepper=:QuasiAdamsBashforth2)
 
     arch = architecture(grid)
 
@@ -51,17 +57,19 @@ function run_simple_particle_tracking_tests(grid, timestepper)
     particles = LagrangianParticles(x=xs, y=ys, z=zs)
     @test particles isa LagrangianParticles
 
-    sim, jld2_filepath, nc_filepath = particle_tracking_simulation(; grid, particles, timestepper)
-    model = sim.model
-    run!(sim)
+    if grid isa RectilinearGrid
+        sim, jld2_filepath, nc_filepath = particle_tracking_simulation(; grid, particles, timestepper)
+        model = sim.model
+        run!(sim)
 
-    # Just test we run without errors
-    @test length(model.particles) == P
-    @test propertynames(model.particles.properties) == (:x, :y, :z)
+        # Just test we run without errors
+        @test length(model.particles) == P
+        @test propertynames(model.particles.properties) == (:x, :y, :z)
 
-    rm(jld2_filepath)
-    rm(nc_filepath)
-    rm("particles_checkpoint_iteration0.jld2")
+        rm(jld2_filepath)
+        rm(nc_filepath)
+        rm("particles_checkpoint_iteration0.jld2")
+    end
 
     #####
     ##### Test Boundary restitution
@@ -89,7 +97,7 @@ function run_simple_particle_tracking_tests(grid, timestepper)
     @test all(zᶠ .≈ (top_boundary - 0.15))
 
     #####
-    ##### Test custom particle "SpeedTrackingParticle"
+    ##### Test custom particle "TestParticle"
     #####
 
     xs = on_architecture(arch, zeros(P))
@@ -103,9 +111,8 @@ function run_simple_particle_tracking_tests(grid, timestepper)
     # Test custom constructor
     particles = StructArray{TestParticle}((xs, ys, zs, us, vs, ws, ss))
 
-    velocities = VelocityFields(grid)
-    u, v, w = velocities
-    speed = Field(√(u * u + v * v + w * w))
+    u, v, w = velocities = VelocityFields(grid)
+    speed = Field(√(u * u + v * v))
     tracked_fields = merge(velocities, (; s=speed))
 
     # applying v component of advection with background field to ensure it is included
@@ -116,29 +123,31 @@ function run_simple_particle_tracking_tests(grid, timestepper)
     lagrangian_particles = LagrangianParticles(particles; tracked_fields)
     @test lagrangian_particles isa LagrangianParticles
 
-    model = NonhydrostaticModel(; grid, timestepper,
-                                  velocities, particles=lagrangian_particles,
-                                  background_fields=(v=background_v,))
+    if grid isa RectilinearGrid
+        model = NonhydrostaticModel(; grid, timestepper,
+                                      velocities, particles=lagrangian_particles,
+                                      background_fields=(v=background_v,))
 
-    set!(model, u=1)
+        set!(model, u=1)
 
-    sim = Simulation(model, Δt=1e-2, stop_iteration=1)
+        sim = Simulation(model, Δt=1e-2, stop_iteration=1)
 
-    jld2_filepath = "test_particles.jld2"
-    sim.output_writers[:particles_jld2] =
-        JLD2OutputWriter(model, (; particles=model.particles),
-                         filename=jld2_filepath, schedule=IterationInterval(1))
+        jld2_filepath = "test_particles.jld2"
+        sim.output_writers[:particles_jld2] =
+            JLD2OutputWriter(model, (; particles=model.particles),
+                             filename=jld2_filepath, schedule=IterationInterval(1))
 
-    nc_filepath = "test_particles.nc"
-    sim.output_writers[:particles_nc] =
-        NetCDFOutputWriter(model, model.particles, filename=nc_filepath, schedule=IterationInterval(1))
+        nc_filepath = "test_particles.nc"
+        sim.output_writers[:particles_nc] =
+            NetCDFOutputWriter(model, model.particles, filename=nc_filepath, schedule=IterationInterval(1))
 
-    sim.output_writers[:checkpointer] = Checkpointer(model, schedule=IterationInterval(1),
-                                                    dir=".", prefix="particles_checkpoint")
+        sim.output_writers[:checkpointer] = Checkpointer(model, schedule=IterationInterval(1),
+                                                         dir=".", prefix="particles_checkpoint")
 
-    rm(jld2_filepath)
-    rm(nc_filepath)
-    rm("particles_checkpoint_iteration1.jld2")
+        rm(jld2_filepath)
+        rm(nc_filepath)
+        rm("particles_checkpoint_iteration1.jld2")
+    end
 
     sim, jld2_filepath, nc_filepath = particle_tracking_simulation(; grid, particles=lagrangian_particles, timestepper, velocities)
     model = sim.model
@@ -164,8 +173,10 @@ function run_simple_particle_tracking_tests(grid, timestepper)
     @test size(w) == tuple(P)
     @test size(s) == tuple(P)
 
-    @test all(x .≈ 0.01)
-    @test all(y .≈ 0.01)
+    if grid isa RectilinearGrid
+        @test all(x .≈ 0.01)
+        @test all(y .≈ 0.01)
+    end
     @test all(z .≈ 0.5)
     @test all(u .≈ 1)
     @test all(v .≈ 1)
@@ -185,8 +196,10 @@ function run_simple_particle_tracking_tests(grid, timestepper)
     @test size(w) == (P, 2)
     @test size(s) == (P, 2)
 
-    @test all(x[:, end] .≈ 0.01)
-    @test all(y[:, end] .≈ 0.01)
+    if grid isa RectilinearGrid
+        @test all(x[:, end] .≈ 0.01)
+        @test all(y[:, end] .≈ 0.01)
+    end
     @test all(z[:, end] .≈ 0.5)
     @test all(u[:, end] .≈ 1)
     @test all(v[:, end] .≈ 1)
@@ -210,8 +223,10 @@ function run_simple_particle_tracking_tests(grid, timestepper)
     @test size(file["timeseries/particles/1"].w) == tuple(P)
     @test size(file["timeseries/particles/1"].s) == tuple(P)
 
-    @test all(file["timeseries/particles/1"].x .≈ 0.01)
-    @test all(file["timeseries/particles/1"].y .≈ 0.01)
+    if grid isa RectilinearGrid
+        @test all(file["timeseries/particles/1"].x .≈ 0.01)
+        @test all(file["timeseries/particles/1"].y .≈ 0.01)
+    end
     @test all(file["timeseries/particles/1"].z .≈ 0.5)
     @test all(file["timeseries/particles/1"].u .≈ 1)
     @test all(file["timeseries/particles/1"].v .≈ 1)
@@ -250,8 +265,10 @@ function run_simple_particle_tracking_tests(grid, timestepper)
     @test size(w) == tuple(P)
     @test size(s) == tuple(P)
 
-    @test all(x .≈ 0.01)
-    @test all(y .≈ 0.01)
+    if grid isa RectilinearGrid
+        @test all(x .≈ 0.01)
+        @test all(y .≈ 0.01)
+    end
     @test all(z .≈ 0.5)
     @test all(u .≈ 1)
     @test all(v .≈ 1)
@@ -269,6 +286,22 @@ lagrangian_particle_test_grid(arch, ::Periodic, z) =
 lagrangian_particle_test_grid(arch, ::Flat, z) =
     RectilinearGrid(arch; topology=(Periodic, Flat, Bounded), size=(5, 5), x=(-1, 1), z)
 
+# Same layouts as `lagrangian_particle_test_grid`, but with the vertical coordinate doubled.
+lagrangian_particle_test_grid_expanded(arch, ::Periodic, z) =
+    RectilinearGrid(arch; topology=(Periodic, Periodic, Bounded), size=(5, 5, 5), x=(-1, 1), y=(-1, 1), z=2 .* z)
+lagrangian_particle_test_grid_expanded(arch, ::Flat, z) =
+    RectilinearGrid(arch; topology=(Periodic, Flat, Bounded), size=(5, 5), x=(-1, 1), z=2 .* z)
+
+function lagrangian_particle_test_immersed_grid(arch, y_topo, z)
+    masked(x, z) = abs(z) > 1 # immersed wherever z < -1 or z > 1
+    masked(x, y, z) = masked(x, z)
+    expanded_grid = lagrangian_particle_test_grid_expanded(arch, y_topo, z)
+    return ImmersedBoundaryGrid(expanded_grid, GridFittedBoundary(masked))
+end
+
+lagrangian_particle_test_curvilinear_grid(arch, z) =
+    LatitudeLongitudeGrid(arch; size=(5, 5, 5), longitude=(-1, 1), latitude=(-1, 1), z, precompute_metrics=true)
+
 @testset "Lagrangian particle tracking" begin
     timesteppers = (:QuasiAdamsBashforth2, :RungeKutta3)
     y_topologies = (Periodic(), Flat())
@@ -278,5 +311,17 @@ lagrangian_particle_test_grid(arch, ::Flat, z) =
         @info "  Testing Lagrangian particle tracking [$(typeof(arch)), $timestepper] with y $(typeof(y_topo)) on vertically $z_grid_type grid ..."
         grid = lagrangian_particle_test_grid(arch, y_topo, z)
         run_simple_particle_tracking_tests(grid, timestepper)
+
+        if z isa NTuple{2} # Test immersed regular grids
+            @info "  Testing Lagrangian particle tracking [$(typeof(arch)), $timestepper] with y $(typeof(y_topo)) on vertically $z_grid_type immersed grid ..."
+            grid = lagrangian_particle_test_immersed_grid(arch, y_topo, z)
+            run_simple_particle_tracking_tests(grid, timestepper)
+        end
+    end
+
+    for arch in archs, (z_grid_type, z) in pairs(vertical_grids)
+        @info "  Testing Lagrangian particle tracking [$(typeof(arch))] with a LatitudeLongitudeGrid with vertically $z_grid_type z coordinate ..."
+        grid = lagrangian_particle_test_curvilinear_grid(arch, z)
+        run_simple_particle_tracking_tests(grid)
     end
 end
diff --git a/test/test_matrix_poisson_solver.jl b/test/test_matrix_poisson_solver.jl
index ad878cf24b..c63ac16398 100644
--- a/test/test_matrix_poisson_solver.jl
+++ b/test/test_matrix_poisson_solver.jl
@@ -1,9 +1,7 @@
+include("dependencies_for_runtests.jl")
+
 using Oceananigans.Solvers: solve!, HeptadiagonalIterativeSolver, sparse_approximate_inverse
 using Oceananigans.Operators: volume, Δyᶠᶜᵃ, Δyᶜᶠᵃ, Δyᶜᶜᵃ, Δxᶠᶜᵃ, Δxᶜᶠᵃ, Δxᶜᶜᵃ, Δyᵃᶜᵃ, Δxᶜᵃᵃ, Δzᵃᵃᶠ, Δzᵃᵃᶜ, ∇²ᶜᶜᶜ
-using Oceananigans.Architectures: on_architecture
-using Oceananigans.Grids: architecture
-using KernelAbstractions: @kernel, @index
-using Statistics, LinearAlgebra, SparseArrays
 
 function identity_operator!(b, x)
     parent(b) .= parent(x)
diff --git a/test/test_multi_region_advection_diffusion.jl b/test/test_multi_region_advection_diffusion.jl
index 2f11e8449f..2eb496a953 100644
--- a/test/test_multi_region_advection_diffusion.jl
+++ b/test/test_multi_region_advection_diffusion.jl
@@ -1,11 +1,6 @@
 include("dependencies_for_runtests.jl")
 
-using Oceananigans.MultiRegion
-using Oceananigans.MultiRegion: reconstruct_global_field
-using Oceananigans.Operators: hack_cosd
-
 Gaussian(x, y, L) = exp(-(x^2 + y^2) / 2L^2)
-
 prescribed_velocities() = PrescribedVelocityFields(u=(λ, ϕ, z, t = 0) -> 0.1 * hack_cosd(ϕ))
 
 function Δ_min(grid)
diff --git a/test/test_multi_region_implicit_solver.jl b/test/test_multi_region_implicit_solver.jl
index e0024e43f6..64adeaa457 100644
--- a/test/test_multi_region_implicit_solver.jl
+++ b/test/test_multi_region_implicit_solver.jl
@@ -1,15 +1,5 @@
 include("dependencies_for_runtests.jl")
 
-using Oceananigans
-using Oceananigans.Units
-using Statistics
-using Printf
-using Oceananigans.MultiRegion
-using LinearAlgebra, SparseArrays
-using Oceananigans.Solvers: constructors, unpack_constructors
-using Oceananigans.Grids: architecture
-using Oceananigans.Utils
-
 function geostrophic_adjustment_test(free_surface, grid; regions = 1)
 
     if architecture(grid) isa GPU
diff --git a/test/test_multi_region_unit.jl b/test/test_multi_region_unit.jl
index c3eaeafc45..f5cc3496a3 100644
--- a/test/test_multi_region_unit.jl
+++ b/test/test_multi_region_unit.jl
@@ -1,9 +1,5 @@
 include("dependencies_for_runtests.jl")
 
-using Oceananigans.MultiRegion
-using Oceananigans.MultiRegion: reconstruct_global_grid, reconstruct_global_field, getnamewrapper
-using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid, GridFittedBottom, GridFittedBoundary
-
 devices(::CPU, num) = nothing
 devices(::GPU, num) = Tuple(0 for i in 1:num)
 
diff --git a/test/test_netcdf_output_writer.jl b/test/test_netcdf_output_writer.jl
index e70b110a55..2958d55a4f 100644
--- a/test/test_netcdf_output_writer.jl
+++ b/test/test_netcdf_output_writer.jl
@@ -14,7 +14,8 @@ using Oceananigans: Clock
 function test_DateTime_netcdf_output(arch)
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
     clock = Clock(time=DateTime(2021, 1, 1))
-    model = NonhydrostaticModel(; grid, clock, buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
+    model = NonhydrostaticModel(; grid, clock, timestepper=:QuasiAdamsBashforth2, 
+                                buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
 
     Δt = 5days + 3hours + 44.123seconds
     simulation = Simulation(model; Δt, stop_time=DateTime(2021, 2, 1))
@@ -132,7 +133,8 @@ end
 function test_TimeDate_netcdf_output(arch)
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
     clock = Clock(time=TimeDate(2021, 1, 1))
-    model = NonhydrostaticModel(; grid, clock, buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
+    model = NonhydrostaticModel(; grid, clock, timestepper=:QuasiAdamsBashforth2,
+                                buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
 
     Δt = 5days + 3hours + 44.123seconds
     simulation = Simulation(model, Δt=Δt, stop_time=TimeDate(2021, 2, 1))
@@ -435,7 +437,8 @@ function test_netcdf_function_output(arch)
     iters = 3
 
     grid = RectilinearGrid(arch, size=(Nx, Ny, Nz), extent=(L, 2L, 3L))
-    model = NonhydrostaticModel(; grid, buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
+    model = NonhydrostaticModel(; grid, timestepper=:QuasiAdamsBashforth2,
+                                buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
 
     simulation = Simulation(model, Δt=Δt, stop_iteration=iters)
     grid = model.grid
@@ -777,7 +780,7 @@ end
 
 function test_netcdf_output_alignment(arch)
     grid = RectilinearGrid(size=(1, 1, 1), extent=(1, 1, 1))
-    model = NonhydrostaticModel(grid=grid,
+    model = NonhydrostaticModel(; grid, timestepper=:QuasiAdamsBashforth2,
                                 buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
     simulation = Simulation(model, Δt=0.2, stop_time=40)
 
@@ -813,7 +816,7 @@ function test_netcdf_vertically_stretched_grid_output(arch)
     zF = [k^2 for k in 0:Nz]
     grid = RectilinearGrid(arch; size=(Nx, Ny, Nz), x=(0, 1), y=(-π, π), z=zF)
 
-    model = NonhydrostaticModel(grid=grid,
+    model = NonhydrostaticModel(; grid,
                                 buoyancy=SeawaterBuoyancy(), tracers=(:T, :S))
 
     Δt = 1.25
diff --git a/test/test_nonhydrostatic_regression.jl b/test/test_nonhydrostatic_regression.jl
index 44be406b4e..2d50afdfe2 100644
--- a/test/test_nonhydrostatic_regression.jl
+++ b/test/test_nonhydrostatic_regression.jl
@@ -47,30 +47,34 @@ include("regression_tests/ocean_large_eddy_simulation_regression_test.jl")
 @testset "Nonhydrostatic Regression" begin
     @info "Running nonhydrostatic regression tests..."
 
+    archs = nonhydrostatic_regression_test_architectures()
+
     for arch in archs
         A = typeof(arch)
 
         for grid_type in [:regular, :vertically_unstretched]
-            @testset "Thermal bubble [$A, $grid_type grid]" begin
-                @info "  Testing thermal bubble regression [$A, $grid_type grid]"
-                run_thermal_bubble_regression_test(arch, grid_type)
-            end
-
             @testset "Rayleigh–Bénard tracer [$A, $grid_type grid]]" begin
                 @info "  Testing Rayleigh–Bénard tracer regression [$A, $grid_type grid]"
                 run_rayleigh_benard_regression_test(arch, grid_type)
             end
 
-            amd_closure = (AnisotropicMinimumDissipation(), ScalarDiffusivity(ν=1.05e-6, κ=1.46e-7))
-            smag_closure = (SmagorinskyLilly(C=0.23, Cb=1, Pr=1), ScalarDiffusivity(ν=1.05e-6, κ=1.46e-7))
-
-            for closure in (amd_closure, smag_closure)
-                closurename = string(typeof(first(closure)).name.wrapper)
-                @testset "Ocean large eddy simulation [$A, $closurename, $grid_type grid]" begin
-                    @info "  Testing oceanic large eddy simulation regression [$A, $closurename, $grid_type grid]"
-                    run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closure)
+            if !(arch isa Distributed)
+                @testset "Thermal bubble [$A, $grid_type grid]" begin
+                    @info "  Testing thermal bubble regression [$A, $grid_type grid]"
+                    run_thermal_bubble_regression_test(arch, grid_type)
+                end
+            
+                amd_closure = (AnisotropicMinimumDissipation(), ScalarDiffusivity(ν=1.05e-6, κ=1.46e-7))
+                smag_closure = (SmagorinskyLilly(C=0.23, Cb=1, Pr=1), ScalarDiffusivity(ν=1.05e-6, κ=1.46e-7))
+
+                for closure in (amd_closure, smag_closure)
+                    closurename = string(typeof(first(closure)).name.wrapper)
+                    @testset "Ocean large eddy simulation [$A, $closurename, $grid_type grid]" begin
+                        @info "  Testing oceanic large eddy simulation regression [$A, $closurename, $grid_type grid]"
+                        run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closure)
+                    end
                 end
             end
         end
     end
-end
+end
\ No newline at end of file
diff --git a/test/test_output_readers.jl b/test/test_output_readers.jl
index 15e5cb982c..fd51e1ca7a 100644
--- a/test/test_output_readers.jl
+++ b/test/test_output_readers.jl
@@ -1,7 +1,7 @@
 include("dependencies_for_runtests.jl")
 
 using Oceananigans.Utils: Time
-using Oceananigans.Fields: indices
+using Oceananigans.Fields: indices, interpolate!
 using Oceananigans.OutputReaders: Cyclical, Clamp
 
 function generate_some_interesting_simulation_data(Nx, Ny, Nz; architecture=CPU())
@@ -36,16 +36,20 @@ function generate_some_interesting_simulation_data(Nx, Ny, Nz; architecture=CPU(
 
     fields_to_output = merge(model.velocities, model.tracers, computed_fields)
 
+    filepath3d = "test_3d_output_with_halos.jld2"
+    filepath2d = "test_2d_output_with_halos.jld2"
+    filepath1d = "test_1d_output_with_halos.jld2"
+
     simulation.output_writers[:jld2_3d_with_halos] =
         JLD2OutputWriter(model, fields_to_output,
-                         filename = "test_3d_output_with_halos.jld2",
+                         filename = filepath3d,
                          with_halos = true,
                          schedule = TimeInterval(30seconds),
                          overwrite_existing = true)
 
     simulation.output_writers[:jld2_2d_with_halos] =
         JLD2OutputWriter(model, fields_to_output,
-                         filename = "test_2d_output_with_halos.jld2",
+                         filename = filepath2d,
                          indices = (:, :, grid.Nz),
                          with_halos = true,
                          schedule = TimeInterval(30seconds),
@@ -55,33 +59,28 @@ function generate_some_interesting_simulation_data(Nx, Ny, Nz; architecture=CPU(
 
     simulation.output_writers[:jld2_1d_with_halos] =
         JLD2OutputWriter(model, profiles,
-                         filename = "test_1d_output_with_halos.jld2",
+                         filename = filepath1d,
                          with_halos = true,
                          schedule = TimeInterval(30seconds),
                          overwrite_existing = true)
 
     run!(simulation)
 
-    return nothing
+    return filepath1d, filepath2d, filepath3d
 end
 
 @testset "OutputReaders" begin
     @info "Testing output readers..."
 
-    Nx, Ny, Nz = 16, 10, 5
-    generate_some_interesting_simulation_data(Nx, Ny, Nz)
     Nt = 5
-
-    filepath3d = "test_3d_output_with_halos.jld2"
-    filepath2d = "test_2d_output_with_halos.jld2"
-    filepath1d = "test_1d_output_with_halos.jld2"
+    Nx, Ny, Nz = 16, 10, 5
+    filepath1d, filepath2d, filepath3d = generate_some_interesting_simulation_data(Nx, Ny, Nz)
 
     for arch in archs
         @testset "FieldTimeSeries{InMemory} [$(typeof(arch))]" begin
             @info "  Testing FieldTimeSeries{InMemory} [$(typeof(arch))]..."
 
-            ## 3D Fields
-
+            # 3D Fields
             u3 = FieldTimeSeries(filepath3d, "u", architecture=arch)
             v3 = FieldTimeSeries(filepath3d, "v", architecture=arch)
             w3 = FieldTimeSeries(filepath3d, "w", architecture=arch)
@@ -115,6 +114,7 @@ end
             ArrayType = array_type(arch)
             for fts in (u3, v3, w3, T3, b3, ζ3)
                 @test parent(fts) isa ArrayType
+                @test (fts.times isa StepRangeLen) | (fts.times isa ArrayType)
             end
 
             if arch isa CPU
@@ -123,6 +123,37 @@ end
                 @test v3[2] isa Field
             end
 
+            # Tests that we can interpolate
+            u3i = FieldTimeSeries{Face, Center, Center}(u3.grid, u3.times)
+            interpolate!(u3i, u3)
+            @test all(interior(u3i) .≈ interior(u3))
+
+            # Interpolation to a _located_ single column grid
+            grid3 = RectilinearGrid(arch, size=(3, 3, 3), x=(0.5, 3.5), y=(0.5, 3.5), z=(0.5, 3.5),
+                                    topology = (Periodic, Periodic, Bounded))
+
+            grid1 = RectilinearGrid(arch, size=3, x=1.3, y=2.7, z=(0.5, 3.5),
+                                    topology=(Flat, Flat, Bounded))
+
+            times = [1, 2]
+            c3 = FieldTimeSeries{Center, Center, Center}(grid3, times)
+            c1 = FieldTimeSeries{Center, Center, Center}(grid1, times)
+
+            for n in 1:length(times)
+                tn = times[n]
+                c₀(x, y, z) = (x + y + z) * tn
+                set!(c3[n], c₀)
+            end
+
+            interpolate!(c1, c3)
+
+            # Convert to CPU for testing
+            c11 = interior(c1[1], 1, 1, :) |> Array
+            c12 = interior(c1[2], 1, 1, :) |> Array
+
+            @test c11 ≈ [5.0, 6.0, 7.0]
+            @test c12 ≈ [10.0, 12.0, 14.0]
+
             ## 2D sliced Fields
 
             u2 = FieldTimeSeries(filepath2d, "u", architecture=arch)
@@ -318,9 +349,25 @@ end
 
             @test ds isa FieldDataset
             @test length(keys(ds.fields)) == 8
-            @test ds["u"] isa FieldTimeSeries
-            @test ds["v"][1] isa Field
-            @test ds["T"][2] isa Field
+
+            for var_str in ("u", "v", "w", "T", "S", "b", "ζ", "ke")
+                @test ds[var_str] isa FieldTimeSeries
+                @test ds[var_str][1] isa Field
+            end
+            
+            for var_sym in (:u, :v, :w, :T, :S, :b, :ζ, :ke)
+                @test ds[var_sym] isa FieldTimeSeries
+                @test ds[var_sym][2] isa Field
+            end
+
+            @test ds.u isa FieldTimeSeries
+            @test ds.v isa FieldTimeSeries
+            @test ds.w isa FieldTimeSeries
+            @test ds.T isa FieldTimeSeries
+            @test ds.S isa FieldTimeSeries
+            @test ds.b isa FieldTimeSeries
+            @test ds.ζ isa FieldTimeSeries
+            @test ds.ke isa FieldTimeSeries
         end
     end
 
diff --git a/test/test_preconditioned_conjugate_gradient_solver.jl b/test/test_preconditioned_conjugate_gradient_solver.jl
index 771c7f50c2..4f4eb0e4c6 100644
--- a/test/test_preconditioned_conjugate_gradient_solver.jl
+++ b/test/test_preconditioned_conjugate_gradient_solver.jl
@@ -10,7 +10,7 @@ end
 
 function run_identity_operator_test(grid)
     b = CenterField(grid)
-    solver = PreconditionedConjugateGradientSolver(identity_operator!, template_field = b, reltol=0, abstol=10*sqrt(eps(eltype(grid))))
+    solver = ConjugateGradientSolver(identity_operator!, template_field = b, reltol=0, abstol=10*sqrt(eps(eltype(grid))))
     initial_guess = solution = similar(b)
     set!(initial_guess, (x, y, z) -> rand())
 
@@ -33,7 +33,7 @@ function run_poisson_equation_test(grid)
     ∇²ϕ = r = CenterField(grid)
     compute_∇²!(∇²ϕ, ϕ_truth, arch, grid)
 
-    solver = PreconditionedConjugateGradientSolver(compute_∇²!, template_field=ϕ_truth, reltol=eps(eltype(grid)), maxiter=Int(1e10))
+    solver = ConjugateGradientSolver(compute_∇²!, template_field=ϕ_truth, reltol=eps(eltype(grid)), maxiter=Int(1e10))
 
     # Solve Poisson equation
     ϕ_solution = CenterField(grid)
@@ -61,9 +61,9 @@ function run_poisson_equation_test(grid)
     return nothing
 end
 
-@testset "PreconditionedConjugateGradientSolver" begin
+@testset "ConjugateGradientSolver" begin
     for arch in archs
-        @info "Testing PreconditionedConjugateGradientSolver [$(typeof(arch))]..."
+        @info "Testing ConjugateGradientSolver [$(typeof(arch))]..."
         grid = RectilinearGrid(arch, size=(4, 8, 4), extent=(1, 3, 1))
         run_identity_operator_test(grid)
         run_poisson_equation_test(grid)
diff --git a/test/test_schedules.jl b/test/test_schedules.jl
index 5b49c017cb..a01f2ffd2c 100644
--- a/test/test_schedules.jl
+++ b/test/test_schedules.jl
@@ -1,6 +1,7 @@
 include("dependencies_for_runtests.jl")
 
 using Oceananigans.Utils: TimeInterval, IterationInterval, WallTimeInterval, SpecifiedTimes
+using Oceananigans.Utils: schedule_aligned_time_step
 using Oceananigans.TimeSteppers: Clock
 using Oceananigans: initialize!
 
@@ -21,7 +22,7 @@ using Oceananigans: initialize!
     ti = TimeInterval(2)
     initialize!(ti, fake_model_at_iter_0)
 
-    @test ti.actuations == 1
+    @test ti.actuations == 0
     @test ti.interval == 2.0
     @test ti(fake_model_at_time_2)
     @test !(ti(fake_model_at_time_3))
@@ -74,4 +75,9 @@ using Oceananigans: initialize!
     # Specified times includes iteration 0
     st = SpecifiedTimes(0, 2, 4)
     @test initialize!(st, fake_model_at_iter_0)
+
+    fake_clock = (; time=2.1) 
+    st = SpecifiedTimes(2.5)
+    @test 0.4 ≈ schedule_aligned_time_step(st, fake_clock, Inf)
 end
+
diff --git a/test/test_shallow_water_models.jl b/test/test_shallow_water_models.jl
index 23dbddbc38..26c8bf85fb 100644
--- a/test/test_shallow_water_models.jl
+++ b/test/test_shallow_water_models.jl
@@ -150,7 +150,8 @@ end
     end
 
     @testset "Setting ShallowWaterModel fields" begin
-    @info "  Testing setting shallow water model fields..."
+        @info "  Testing setting shallow water model fields..."
+
         for arch in archs, FT in float_types
             N = (4,   4)
             L = (2π, 3π)
diff --git a/test/test_simulations.jl b/test/test_simulations.jl
index 05fdf43d0a..ee384cf298 100644
--- a/test/test_simulations.jl
+++ b/test/test_simulations.jl
@@ -51,9 +51,8 @@ function wall_time_step_wizard_tests(arch)
     Δt = new_time_step(Δt, wizard, model)
     @test Δt ≈ diff_CFL * Δx^2 / model.closure.ν
 
-
     grid_stretched = RectilinearGrid(arch, 
-                                    size = (1, 1, 1),
+                                     size = (1, 1, 1),
                                      x = (0, 1),
                                      y = (0, 1),
                                      z = z -> z, 
@@ -164,7 +163,7 @@ function run_simulation_date_tests(arch, start_time, stop_time, Δt)
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
 
     clock = Clock(time=start_time)
-    model = NonhydrostaticModel(; grid, clock)
+    model = NonhydrostaticModel(; grid, clock, timestepper=:QuasiAdamsBashforth2)
     simulation = Simulation(model; Δt, stop_time)
 
     @test model.clock.time == start_time
@@ -207,6 +206,19 @@ end
         @info "Testing simulations [$(typeof(arch))]..."
         run_basic_simulation_tests(arch)
 
+        # Test initialization for simulations started with iteration ≠ 0
+        grid = RectilinearGrid(arch, size=(), topology=(Flat, Flat, Flat))
+        model = NonhydrostaticModel(; grid)
+        simulation = Simulation(model; Δt=1, stop_time=6)
+        
+        progress_message(sim) = @info string("Iter: ", iteration(sim), ", time: ", prettytime(sim))
+        progress_cb = Callback(progress_message, TimeInterval(2))
+        simulation.callbacks[:progress] = progress_cb
+
+        model.clock.iteration = 1 # we want to start here for some reason
+        run!(simulation)
+        @test progress_cb.schedule.actuations == 3
+
         @testset "NaN Checker [$(typeof(arch))]" begin
             @info "  Testing NaN Checker [$(typeof(arch))]..."
             run_nan_checker_test(arch, erroring=true)
diff --git a/test/test_time_stepping.jl b/test/test_time_stepping.jl
index 9b07247248..a3fbc45bd3 100644
--- a/test/test_time_stepping.jl
+++ b/test/test_time_stepping.jl
@@ -4,14 +4,14 @@ using TimesDates: TimeDate
 using Oceananigans.Grids: topological_tuple_length, total_size
 using Oceananigans.Fields: BackgroundField
 using Oceananigans.TimeSteppers: Clock
-using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: CATKEVerticalDiffusivity
+using Oceananigans.TurbulenceClosures: CATKEVerticalDiffusivity
 
 function time_stepping_works_with_flat_dimensions(arch, topology)
     size = Tuple(1 for i = 1:topological_tuple_length(topology...))
     extent = Tuple(1 for i = 1:topological_tuple_length(topology...))
-    grid = RectilinearGrid(arch, size=size, extent=extent, topology=topology)
-    model = NonhydrostaticModel(grid=grid)
-    time_step!(model, 1, euler=true)
+    grid = RectilinearGrid(arch; size, extent, topology)
+    model = NonhydrostaticModel(; grid)
+    time_step!(model, 1)
     return true # Test that no errors/crashes happen when time stepping.
 end
 
@@ -29,11 +29,9 @@ end
 
 function time_stepping_works_with_coriolis(arch, FT, Coriolis)
     grid = RectilinearGrid(arch, FT, size=(1, 1, 1), extent=(1, 2, 3))
-    c = Coriolis(FT, latitude=45)
-    model = NonhydrostaticModel(grid=grid, coriolis=c)
-
-    time_step!(model, 1, euler=true)
-
+    coriolis = Coriolis(FT, latitude=45)
+    model = NonhydrostaticModel(; grid, coriolis)
+    time_step!(model, 1)
     return true # Test that no errors/crashes happen when time stepping.
 end
 
@@ -46,16 +44,16 @@ function time_stepping_works_with_closure(arch, FT, Closure; buoyancy=Buoyancy(m
     grid = RectilinearGrid(arch, FT; size=(3, 3, 3), halo=(3, 3, 3), extent=(1, 2, 3))
     closure = Closure(FT)
     model = NonhydrostaticModel(; grid, closure, tracers, buoyancy)
-    time_step!(model, 1, euler=true)
+    time_step!(model, 1)
 
     return true  # Test that no errors/crashes happen when time stepping.
 end
 
-function time_stepping_works_with_advection_scheme(arch, advection_scheme)
+function time_stepping_works_with_advection_scheme(arch, advection)
     # Use halo=(3, 3, 3) to accomodate WENO-5 advection scheme
     grid = RectilinearGrid(arch, size=(3, 3, 3), halo=(3, 3, 3), extent=(1, 2, 3))
-    model = NonhydrostaticModel(grid=grid, advection=advection_scheme)
-    time_step!(model, 1, euler=true)
+    model = NonhydrostaticModel(; grid, advection)
+    time_step!(model, 1)
     return true  # Test that no errors/crashes happen when time stepping.
 end
 
@@ -63,14 +61,14 @@ function time_stepping_works_with_stokes_drift(arch, stokes_drift)
     # Use halo=(3, 3, 3) to accomodate WENO-5 advection scheme
     grid = RectilinearGrid(arch, size=(3, 3, 3), halo=(3, 3, 3), extent=(1, 2, 3))
     model = NonhydrostaticModel(; grid, stokes_drift, advection=nothing)
-    time_step!(model, 1, euler=true)
+    time_step!(model, 1)
     return true  # Test that no errors/crashes happen when time stepping.
 end
 
 function time_stepping_works_with_nothing_closure(arch, FT)
     grid = RectilinearGrid(arch, FT; size=(1, 1, 1), extent=(1, 2, 3))
-    model = NonhydrostaticModel(grid=grid, closure=nothing)
-    time_step!(model, 1, euler=true)
+    model = NonhydrostaticModel(; grid, closure=nothing)
+    time_step!(model, 1)
     return true  # Test that no errors/crashes happen when time stepping.
 end
 
@@ -79,10 +77,8 @@ function time_stepping_works_with_nonlinear_eos(arch, FT, EOS)
 
     eos = EOS()
     b = SeawaterBuoyancy(equation_of_state=eos)
-
-    model = NonhydrostaticModel(grid=grid, buoyancy=b,
-                                tracers=(:T, :S))
-    time_step!(model, 1, euler=true)
+    model = NonhydrostaticModel(; grid, buoyancy=b, tracers=(:T, :S))
+    time_step!(model, 1)
 
     return true  # Test that no errors/crashes happen when time stepping.
 end
@@ -95,15 +91,17 @@ function run_first_AB2_time_step_tests(arch, FT)
     grid = RectilinearGrid(arch, FT, size=(13, 17, 19), extent=(1, 2, 3))
 
     model = NonhydrostaticModel(; grid,
+                                timestepper = :QuasiAdamsBashforth2,
                                 forcing = (; T=add_ones),
                                 buoyancy = SeawaterBuoyancy(),
                                 tracers = (:T, :S))
 
-    # Test that GT = 1 after model construction (note: this computes tendencies)
+    # Test that GT = 0 after model construction
+    # (note: model construction does not compute tendencies)
     @test all(interior(model.timestepper.Gⁿ.u) .≈ 0)
     @test all(interior(model.timestepper.Gⁿ.v) .≈ 0)
     @test all(interior(model.timestepper.Gⁿ.w) .≈ 0)
-    @test all(interior(model.timestepper.Gⁿ.T) .≈ 1)
+    @test all(interior(model.timestepper.Gⁿ.T) .≈ 0)
     @test all(interior(model.timestepper.Gⁿ.S) .≈ 0)
 
     # Test that T = 1 after 1 time step and that AB2 actually reduced to forward Euler.
@@ -215,23 +213,27 @@ function time_stepping_with_background_fields(arch)
     background_S_func(x, y, z, t, α) = α * y
     background_S = BackgroundField(background_S_func, parameters=1.2)
 
+    background_R = BackgroundField(1)
+
     background_fields = (u = background_u,
                          v = background_v,
                          w = background_w,
                          T = background_T,
-                         S = background_S)
+                         S = background_S,
+                         R = background_R)
 
     model = NonhydrostaticModel(; grid, background_fields,
                                 buoyancy = SeawaterBuoyancy(),
-                                tracers=(:T, :S))
+                                tracers=(:T, :S, :R))
 
-    time_step!(model, 1, euler=true)
+    time_step!(model, 1)
 
     return location(model.background_fields.velocities.u) === (Face, Center, Center) &&
            location(model.background_fields.velocities.v) === (Center, Face, Center) &&
            location(model.background_fields.velocities.w) === (Center, Center, Face) &&
            location(model.background_fields.tracers.T) === (Center, Center, Center) &&
-           location(model.background_fields.tracers.S) === (Center, Center, Center)
+           location(model.background_fields.tracers.S) === (Center, Center, Center) &&
+           location(model.background_fields.tracers.R) === (Nothing, Nothing, Nothing)
 end
 
 Planes = (FPlane, ConstantCartesianCoriolis, BetaPlane, NonTraditionalBetaPlane)
@@ -301,13 +303,15 @@ timesteppers = (:QuasiAdamsBashforth2, :RungeKutta3)
         @testset "Time stepping with DateTimes [$(typeof(arch)), $FT]" begin
             @info "  Testing time stepping with datetime clocks [$(typeof(arch)), $FT]"
 
-            model = NonhydrostaticModel(grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1)),
-                                        clock = Clock(time=DateTime(2020)))
+            grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1))
+            clock = Clock(time=DateTime(2020))
+            model = NonhydrostaticModel(; grid, clock, timestepper=:QuasiAdamsBashforth2)
 
             time_step!(model, 7.883)
             @test model.clock.time == DateTime("2020-01-01T00:00:07.883")
 
             model = NonhydrostaticModel(grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 1, 1)),
+                                        timestepper = :QuasiAdamsBashforth2,
                                         clock = Clock(time=TimeDate(2020)))
 
             time_step!(model, 123e-9)  # 123 nanoseconds
@@ -374,9 +378,7 @@ timesteppers = (:QuasiAdamsBashforth2, :RungeKutta3)
             for Closure in Closures
                 @info "  Testing that time stepping works [$(typeof(arch)), $FT, $Closure]..."
                 if Closure === TwoDimensionalLeith
-                    # TwoDimensionalLeith is slow on the CPU and doesn't compile right now on the GPU.
-                    # See: https://github.com/CliMA/Oceananigans.jl/pull/1074
-                    @test_skip time_stepping_works_with_closure(arch, FT, Closure)
+                    @test time_stepping_works_with_closure(arch, FT, Closure)
                 elseif Closure === CATKEVerticalDiffusivity
                     # CATKE isn't supported with NonhydrostaticModel yet
                     @test_skip time_stepping_works_with_closure(arch, FT, Closure)
diff --git a/test/test_turbulence_closures.jl b/test/test_turbulence_closures.jl
index 747f60171c..d2e5cafb95 100644
--- a/test/test_turbulence_closures.jl
+++ b/test/test_turbulence_closures.jl
@@ -104,21 +104,17 @@ function time_step_with_variable_isotropic_diffusivity(arch)
 
     model = NonhydrostaticModel(; grid, closure)
 
-    time_step!(model, 1, euler=true)
+    time_step!(model, 1)
     return true
 end
 
 function time_step_with_field_isotropic_diffusivity(arch)
     grid = RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 2, 3))
-
     ν = CenterField(grid)
     κ = CenterField(grid)
-
     closure = ScalarDiffusivity(; ν, κ)
-
     model = NonhydrostaticModel(; grid, closure)
-
-    time_step!(model, 1, euler=true)
+    time_step!(model, 1)
     return true
 end
 
@@ -130,7 +126,7 @@ function time_step_with_variable_anisotropic_diffusivity(arch)
                                        κ = (x, y, z, t) -> exp(z) * cos(x) * cos(y) * cos(t))
     for clo in (clov, cloh)
         model = NonhydrostaticModel(grid=RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 2, 3)), closure=clo)
-        time_step!(model, 1, euler=true)
+        time_step!(model, 1)
     end
 
     return true
@@ -143,9 +139,11 @@ function time_step_with_variable_discrete_diffusivity(arch)
     closure_ν = ScalarDiffusivity(ν = νd, discrete_form=true, loc = (Face, Center, Center))
     closure_κ = ScalarDiffusivity(κ = κd, discrete_form=true, loc = (Center, Face, Center))
 
-    model = NonhydrostaticModel(grid=RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 2, 3)), tracers = (:T, :S), closure=(closure_ν, closure_κ))
+    model = NonhydrostaticModel(grid=RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 2, 3)),
+                                tracers = (:T, :S),
+                                closure = (closure_ν, closure_κ))
 
-    time_step!(model, 1, euler=true)
+    time_step!(model, 1)
     return true
 end
 
@@ -155,7 +153,7 @@ function time_step_with_tupled_closure(FT, arch)
     model = NonhydrostaticModel(closure=closure_tuple,
                                 grid=RectilinearGrid(arch, FT, size=(2, 2, 2), extent=(1, 2, 3)))
 
-    time_step!(model, 1, euler=true)
+    time_step!(model, 1)
     return true
 end
 
@@ -278,20 +276,20 @@ end
 
             @info "    Testing time-stepping CATKE in a 2-tuple with HorizontalScalarDiffusivity..."
             closure = (CATKEVerticalDiffusivity(), HorizontalScalarDiffusivity())
-            #model = run_time_step_with_catke_tests(arch, closure)
-            @test_skip first(model.closure) === closure[1]
+            model = run_time_step_with_catke_tests(arch, closure)
+            @test first(model.closure) === closure[1]
 
             # Test that closure tuples with CATKE are correctly reordered
             @info "    Testing time-stepping CATKE in a 2-tuple with HorizontalScalarDiffusivity..."
             closure = (HorizontalScalarDiffusivity(), CATKEVerticalDiffusivity())
-            #model = run_time_step_with_catke_tests(arch, closure)
-            @test_skip first(model.closure) === closure[2]
+            model = run_time_step_with_catke_tests(arch, closure)
+            @test first(model.closure) === closure[2]
 
             # These are slow to compile...
             @info "    Testing time-stepping CATKE in a 3-tuple..."
             closure = (HorizontalScalarDiffusivity(), CATKEVerticalDiffusivity(), VerticalScalarDiffusivity())
-            #model = run_time_step_with_catke_tests(arch, closure)
-            @test_skip first(model.closure) === closure[2]
+            model = run_time_step_with_catke_tests(arch, closure)
+            @test first(model.closure) === closure[2]
         end
     end
 
diff --git a/test/utils_for_runtests.jl b/test/utils_for_runtests.jl
index d4a92035e2..7bc06e528e 100644
--- a/test/utils_for_runtests.jl
+++ b/test/utils_for_runtests.jl
@@ -1,22 +1,16 @@
-using Test
-using Statistics
-using CUDA
-using Printf
-using MPI
-using KernelAbstractions: @kernel, @index
-
-using Oceananigans
 using Oceananigans.TimeSteppers: QuasiAdamsBashforth2TimeStepper, RungeKutta3TimeStepper, update_state!
 using Oceananigans.DistributedComputations: Distributed, Partition, child_architecture, Fractional, Equal
 
 import Oceananigans.Fields: interior
 
+test_child_arch() = CUDA.has_cuda() ? GPU() : CPU()
+
 function test_architectures() 
-    child_arch =  CUDA.has_cuda() ? GPU() : CPU()
+    child_arch =  test_child_arch()
 
     # If MPI is initialized with MPI.Comm_size > 0, we are running in parallel.
-    # We test 3 different configurations: `Partition(x = 4)`, `Partition(y = 4)` 
-    # and `Partition(x = 4, y = 4)`
+    # We test several different configurations: `Partition(x = 4)`, `Partition(y = 4)`, 
+    # `Partition(x = 2, y = 2)`, and different fractional subdivisions in x, y and xy
     if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
         return (Distributed(child_arch; partition = Partition(4)),
                 Distributed(child_arch; partition = Partition(1, 4)),
@@ -29,6 +23,23 @@ function test_architectures()
     end
 end
 
+# For nonhydrostatic simulations we cannot use `Fractional` at the moment (requirements
+# for the transpose are more stringent than for hydrostatic simulations).
+function nonhydrostatic_regression_test_architectures() 
+    child_arch =  test_child_arch()
+
+    # If MPI is initialized with MPI.Comm_size > 0, we are running in parallel.
+    # We test 3 different configurations: `Partition(x = 4)`, `Partition(y = 4)` 
+    # and `Partition(x = 2, y = 2)`
+    if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
+        return (Distributed(child_arch; partition = Partition(4)),
+                Distributed(child_arch; partition = Partition(1, 4)),
+                Distributed(child_arch; partition = Partition(2, 2)))
+    else
+        return tuple(child_arch)
+    end
+end
+
 function summarize_regression_test(fields, correct_fields)
     for (field_name, φ, φ_c) in zip(keys(fields), fields, correct_fields)
         Δ = φ .- φ_c
@@ -87,8 +98,7 @@ end
 
 function compute_∇²!(∇²ϕ, ϕ, arch, grid)
     fill_halo_regions!(ϕ)
-    child_arch = child_architecture(arch)
-    launch!(child_arch, grid, :xyz, ∇²!, ∇²ϕ, grid, ϕ)
+    launch!(arch, grid, :xyz, ∇²!, ∇²ϕ, grid, ϕ)
     fill_halo_regions!(∇²ϕ)
     return nothing
 end
diff --git a/validation/advection/validate_one_dimensional_advection.jl b/validation/advection/validate_one_dimensional_advection.jl
index f04c3c3661..f8a8cfbfaa 100644
--- a/validation/advection/validate_one_dimensional_advection.jl
+++ b/validation/advection/validate_one_dimensional_advection.jl
@@ -1,5 +1,5 @@
 using Oceananigans
-using Oceananigans.Advection: AbstractCenteredAdvectionScheme, AbstractUpwindBiasedAdvectionScheme, VelocityStencil, VorticityStencil
+using Oceananigans.Advection: AbstractCenteredAdvectionScheme, AbstractUpwindBiasedAdvectionScheme, VelocityStencil
 using Oceananigans.Models.ShallowWaterModels: VectorInvariantFormulation, ConservativeFormulation
 using JLD2
 using OffsetArrays
@@ -45,8 +45,8 @@ solution  = Dict()
 error     = Dict()
 
 # 1D grid constructions
-grid_reg = RectilinearGrid(arch, size = (Nx, 1), x = Freg,  y = (0, 1), halo = (7, 7), topology = (Periodic, Periodic, Flat))
-grid_str = RectilinearGrid(arch, size = (Nx, 1), x = Fstr,  y = (0, 1), halo = (7, 7), topology = (Periodic, Periodic, Flat))
+grid_reg = RectilinearGrid(arch, size = Nx, x = Freg, halo = 7, topology = (Periodic, Flat, Flat))
+grid_str = RectilinearGrid(arch, size = Nx, x = Fstr, halo = 7, topology = (Periodic, Flat, Flat))
 
 # the initial condition
 @inline G(x, β, z) = exp(-β*(x - z)^2)
@@ -58,7 +58,7 @@ Z = -0.7
 a = 0.5
 α = 10
 
-@inline function c₀_1D(x, y, z) 
+@inline function c₀_1D(x) 
     if x <= -0.6 && x >= -0.8
         return 1/6*(G(x, β, Z-δ) + 4*G(x, β, Z) + G(x, β, Z+δ))
     elseif x <= -0.2 && x >= -0.4
@@ -78,8 +78,8 @@ Schemes = [:Centered, :UpwindBiased, :WENO]
 @inline grid_or_not(::Nothing) = -1
 
 # # Checking the accuracy of different schemes with different settings
-buffers = [2, 3]
-for (gr, grid) in enumerate([grid_str])
+buffers = [2, 3, 4, 5]
+for (gr, grid) in enumerate([grid_reg])
     
     @info "testing grid number $gr"
 
@@ -92,8 +92,9 @@ for (gr, grid) in enumerate([grid_str])
     c_real = CenterField(grid)
     formulation = ConservativeFormulation()
     
-    for Scheme in [Schemes[2]]
-        for buffer in buffers, gr in (nothing, grid)
+    for Scheme in [Schemes[3]]
+        for buffer in buffers
+            gr = nothing
 
             scheme     = eval(Scheme)(gr, order = advection_order(buffer, eval(Scheme)))
             scheme_mom = eval(Scheme)(gr, order = advection_order(buffer, eval(Scheme))) #, vector_invariant = form(formulation))
@@ -107,9 +108,9 @@ for (gr, grid) in enumerate([grid_str])
                                     formulation)
 
             if formulation isa VectorInvariantFormulation
-                set!(model, h=1.0, u=1.0, v=c₀_1D)
+                set!(model, h=1.0, u=-1.0, v=c₀_1D)
             else
-                set!(model, h=1.0, uh=1.0, vh=c₀_1D)
+                set!(model, h=1.0, uh=-1.0, vh=c₀_1D)
             end
 
             set!(model, c=c₀_1D)
@@ -124,7 +125,7 @@ for (gr, grid) in enumerate([grid_str])
                 solution[(buffer, Int(i), grid_or_not(gr))] = csim
             end
 
-            c_sol(x, y, z) = @. c₀_1D(x - model.clock.time + 2, y, z)
+            c_sol(x) = @. c₀_1D(x - model.clock.time + 2)
             set!(c_real, c_sol)
         end
 
@@ -137,10 +138,10 @@ for (gr, grid) in enumerate([grid_str])
             plot!(x, solution[(buffers[2], Int(i), -1.0)], ylims = (-0.3, 1.3), linewidth = 1, linecolor =:blue , legend = false) 
             plot!(x, solution[(buffers[3], Int(i), -1.0)], ylims = (-0.3, 1.3), linewidth = 1, linecolor =:green, legend = false)
             plot!(x, solution[(buffers[4], Int(i), -1.0)], ylims = (-0.3, 1.3), linewidth = 1, linecolor =:yellow, legend = false)
-            plot!(x, solution[(buffers[1], Int(i), 1.0)], ylims = (-0.3, 1.3), linestyle=:dash, linewidth = 1, linecolor =:red  , legend = false) 
-            plot!(x, solution[(buffers[2], Int(i), 1.0)], ylims = (-0.3, 1.3), linestyle=:dash, linewidth = 1, linecolor =:blue , legend = false) 
-            plot!(x, solution[(buffers[3], Int(i), 1.0)], ylims = (-0.3, 1.3), linestyle=:dash, linewidth = 1, linecolor =:green, legend = false)
-            plot!(x, solution[(buffers[4], Int(i), 1.0)], ylims = (-0.3, 1.3), linestyle=:dash, linewidth = 1, linecolor =:yellow, legend = false)
+            # plot!(x, solution[(buffers[1], Int(i), 1.0)], ylims = (-0.3, 1.3), linestyle=:dash, linewidth = 1, linecolor =:red  , legend = false) 
+            # plot!(x, solution[(buffers[2], Int(i), 1.0)], ylims = (-0.3, 1.3), linestyle=:dash, linewidth = 1, linecolor =:blue , legend = false) 
+            # plot!(x, solution[(buffers[3], Int(i), 1.0)], ylims = (-0.3, 1.3), linestyle=:dash, linewidth = 1, linecolor =:green, legend = false)
+            # plot!(x, solution[(buffers[4], Int(i), 1.0)], ylims = (-0.3, 1.3), linestyle=:dash, linewidth = 1, linecolor =:yellow, legend = false)
         end
         mp4(anim, "anim_1D_$(gr)_$(Scheme).mp4", fps = 15)
     end
diff --git a/validation/biogeochemistry/simple_plankton_continuous_form_biogeochemistry.jl b/validation/biogeochemistry/simple_plankton_continuous_form_biogeochemistry.jl
index 5ca59253b8..28eca3083c 100644
--- a/validation/biogeochemistry/simple_plankton_continuous_form_biogeochemistry.jl
+++ b/validation/biogeochemistry/simple_plankton_continuous_form_biogeochemistry.jl
@@ -115,13 +115,12 @@ end
 # Call the integration
 @inline function update_biogeochemical_state!(bgc::SimplePlanktonGrowthDeath, model)
     arch = architecture(model.grid)
-    event = launch!(arch, model.grid, :xy, update_PhotosyntheticallyActiveRatiation!, 
-                    bgc,
-                    model.tracers.P, 
-                    bgc.PAR,
-                    model.grid, 
-                    model.clock.time)
-    wait(event)
+    launch!(arch, model.grid, :xy, update_PhotosyntheticallyActiveRatiation!, 
+            bgc,
+            model.tracers.P, 
+            bgc.PAR,
+            model.grid, 
+            model.clock.time)
 end
 
 #####
diff --git a/validation/distributed_simulations/distributed_geostrophic_adjustment.jl b/validation/distributed_simulations/distributed_geostrophic_adjustment.jl
index 6bcbc5fd8a..a7dc5a7359 100644
--- a/validation/distributed_simulations/distributed_geostrophic_adjustment.jl
+++ b/validation/distributed_simulations/distributed_geostrophic_adjustment.jl
@@ -11,6 +11,7 @@
 using MPI
 using Oceananigans
 using Oceananigans.DistributedComputations
+using Oceananigans.DistributedComputations: Sizes
 using Oceananigans.Grids: topology, architecture
 using Oceananigans.Units: kilometers, meters
 using Printf
@@ -18,9 +19,9 @@ using JLD2
 
 topo = (Bounded, Periodic, Bounded)
 
-partition = Partition([10, 13, 18, 39])
+partition = Partition(x = Sizes(10, 13, 18, 39))
 
-arch = Distributed(CPU(); topology = topo, partition)
+arch = Distributed(CPU(); partition)
 
 # Distribute problem irregularly
 Nx = 80
@@ -30,7 +31,7 @@ Lh = 100kilometers
 Lz = 400meters
 
 grid = RectilinearGrid(arch,
-                       size = (Nx, 3, 1),
+                       size = (Nx, 3, 2),
                        x = (0, Lh),
                        y = (0, Lh),
                        z = (-Lz, 0),
@@ -39,11 +40,13 @@ grid = RectilinearGrid(arch,
 @show rank, grid
 
 bottom(x, y) = x > 80kilometers && x < 90kilometers ? 100 : -500meters
-grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom), true)
+grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom); active_cells_map = true)
+
+coriolis = FPlane(f=1e-4)
 
 model = HydrostaticFreeSurfaceModel(; grid,
-                                    coriolis = FPlane(f=1e-4),
-                                    free_surface = SplitExplicitFreeSurface(grid; substeps=10))
+                                      coriolis,
+                                      free_surface = SplitExplicitFreeSurface(grid; substeps=10))
 
 gaussian(x, L) = exp(-x^2 / 2L^2)
 
diff --git a/validation/distributed_simulations/distributed_hydrostatic_turbulence.jl b/validation/distributed_simulations/distributed_hydrostatic_turbulence.jl
index 44f1d2b0ab..6e88bdd049 100644
--- a/validation/distributed_simulations/distributed_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/distributed_hydrostatic_turbulence.jl
@@ -7,7 +7,6 @@ using Oceananigans.BoundaryConditions
 using Oceananigans.DistributedComputations    
 using Random
 using JLD2
-using Oceananigans.ImmersedBoundaries: ActiveCellsIBG, active_interior_map
 
 # Run with 
 #
diff --git a/validation/distributed_simulations/distributed_nonhydrostatic_turbulence.jl b/validation/distributed_simulations/distributed_nonhydrostatic_turbulence.jl
index 6c93d86cc5..3d8428b164 100644
--- a/validation/distributed_simulations/distributed_nonhydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/distributed_nonhydrostatic_turbulence.jl
@@ -1,42 +1,40 @@
 # Run this script with
 #
-# $ mpiexec -n 2 julia --project mpi_nonhydrostatic_two_dimensional_turbulence.jl
+# $ mpirun -n 2 julia --project distributed_nonhydrostatic_turbulence.jl
 #
 # for example.
 #
 # You also probably should set
 #
 # $ export JULIA_NUM_THREADS=1
+#
+# See MPI.jl documentation for more information on how to setup the MPI environment.
+# If you have a local installation of MPI, you can use it by setting
+# 
+# julia> MPIPreferences.use_system_binaries()
+#
+# before running the script.
 
 using MPI
 using Oceananigans
 using Oceananigans.DistributedComputations
 using Statistics
 using Printf
-using Logging
-
-Logging.global_logger(OceananigansLogger())
-
-comm = MPI.COMM_WORLD
-rank = MPI.Comm_rank(comm)
-Nranks = MPI.Comm_size(comm)
-
-@info "Running on rank $rank of $Nranks..."
+using Random
 
 Nx = Ny = 256
 Lx = Ly = 2π
 topology = (Periodic, Periodic, Flat)
-arch = Distributed(CPU(); topology, ranks=(1, Nranks, 1), communicator=comm)
-grid = RectilinearGrid(arch; topology, size=(Nx ÷ Nranks, Ny), halo=(3, 3), x=(0, 2π), y=(0, 2π))
+arch = Distributed(CPU())
+grid = RectilinearGrid(arch; topology, size=(Nx, Ny), halo=(3, 3), x=(0, 2π), y=(0, 2π))
 
-@info "Built $Nranks grids:"
 @show grid
 
 model = NonhydrostaticModel(; grid, advection=WENO(), closure=ScalarDiffusivity(ν=1e-4, κ=1e-4))
 
-# This doesn't work?
-# ϵ(x, y, z) = 2rand() - 1 # ∈ (-1, 1)
-# set!(model, u=ϵ, v=ϵ)
+# Make sure we use different seeds for different cores.
+rank = arch.local_rank
+Random.seed!((rank+ 1) * 1234)
 
 uᵢ = rand(size(grid)...)
 vᵢ = rand(size(grid)...)
@@ -54,23 +52,22 @@ compute!(ζ)
 simulation = Simulation(model, Δt=0.01, stop_iteration=1000)
 
 function progress(sim)
-    comm = sim.model.grid.architecture.communicator
-    rank = MPI.Comm_rank(comm)
+    rank = sim.model.grid.architecture.local_rank
     compute!(ζ)
     compute!(e)
 
     rank == 0 && @info(string("Iteration: ", iteration(sim), ", time: ", prettytime(sim)))
 
     @info @sprintf("Rank %d: max|ζ|: %.2e, max(e): %.2e",
-                   MPI.Comm_rank(comm), maximum(abs, ζ), maximum(abs, e))
+                   rank, maximum(abs, ζ), maximum(abs, e))
 
     return nothing
 end
 
 simulation.callbacks[:progress] = Callback(progress, IterationInterval(10))
 
-rank = MPI.Comm_rank(arch.communicator)
 outputs = merge(model.velocities, (; e, ζ))
+
 simulation.output_writers[:fields] = JLD2OutputWriter(model, outputs,
                                                       schedule = TimeInterval(0.1),
                                                       with_halos = true,
diff --git a/validation/distributed_simulations/distributed_scaling/distributed_hydrostatic_simulation.jl b/validation/distributed_simulations/distributed_scaling/distributed_hydrostatic_simulation.jl
new file mode 100644
index 0000000000..87621c1bff
--- /dev/null
+++ b/validation/distributed_simulations/distributed_scaling/distributed_hydrostatic_simulation.jl
@@ -0,0 +1,103 @@
+using MPI
+MPI.Init()
+
+using JLD2
+using Statistics: mean
+using Printf
+using Oceananigans
+using Oceananigans.Utils: prettytime
+using Oceananigans.DistributedComputations
+using Oceananigans.Grids: node
+using Oceananigans.Advection: cell_advection_timescale
+using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: CATKEVerticalDiffusivity
+using Oceananigans.Units
+using SeawaterPolynomials.TEOS10: TEOS10EquationOfState
+
+function double_drake_bathymetry(λ, φ) 
+    if φ > -35
+        (λ >  0 && λ < 1)  && return 0.0
+        (λ > 90 && λ < 91) && return 0.0
+    end
+
+    return -10000.0
+end
+
+function run_hydrostatic_simulation!(grid_size, ranks, FT::DataType = Float64; 
+                                     output_name = nothing,
+                                     timestepper = :QuasiAdamsBashforth2,
+                                     CFL = 0.35,
+                                     barotropic_CFL = 0.75)
+
+    arch  = Distributed(GPU(), FT; partition = Partition(ranks...))
+    grid  = LatitudeLongitudeGrid(arch; size = grid_size, longitude = (-180, 180),
+                                  latitude = (-75, 75),
+                                  z = (-5500, 0),
+                                  halo = (7, 7, 7))
+
+    grid  = ImmersedBoundaryGrid(grid, GridFittedBottom(double_drake_bathymetry)) 
+
+    momentum_advection = WENOVectorInvariant(FT)
+    tracer_advection   = WENO(grid, order = 7)
+
+    buoyancy = SeawaterBuoyancy(FT; equation_of_state = TEOS10EquationOfState(FT))
+    coriolis = HydrostaticSphericalCoriolis(FT)
+    closure  = CATKEVerticalDiffusivity(FT)
+
+    max_Δt = 45 * 48 / grid.Δλᶠᵃᵃ
+
+    free_surface = SplitExplicitFreeSurface(FT; grid, cfl = barotropic_CFL, fixed_Δt = max_Δt)
+
+    model = HydrostaticFreeSurfaceModel(; grid, 
+                                          momentum_advection,
+                                          tracer_advection,
+                                          coriolis,
+                                          closure,
+                                          free_surface,
+                                          tracers = (:T, :S, :e),
+                                          buoyancy,
+                                          timestepper)
+
+    wtime = Ref(time_ns())
+    # Progress callback: logs iteration, Δt, wall time since the previous call, and field maxima.
+    function progress(sim)
+        @info @sprintf("iteration: %d, Δt: %2e, wall time: %s (|u|, |v|, |w|): %.2e %.2e %.2e, T: %.2e \n",
+              sim.model.clock.iteration, sim.Δt, prettytime((time_ns() - wtime[])*1e-9),
+              maximum(abs, sim.model.velocities.u), maximum(abs, sim.model.velocities.v),
+              maximum(abs, sim.model.velocities.w), maximum(abs, sim.model.tracers.T)) # tracers are (:T, :S, :e)
+       wtime[] = time_ns()
+    end
+
+    simulation = Simulation(model; Δt=max_Δt, stop_time = 20days, stop_iteration = 100)
+
+    # Adaptive time-stepping
+    wizard = TimeStepWizard(cfl=CFL; max_change=1.1, min_Δt=10, max_Δt)
+    simulation.callbacks[:wizard] = Callback(wizard, IterationInterval(10))
+    simulation.callbacks[:progress] = Callback(progress, IterationInterval(10))
+
+    rank = MPI.Comm_rank(MPI.COMM_WORLD)
+
+    if !isnothing(output_name)
+        simulation.output_writers[:fields] = JLD2OutputWriter(model, merge(model.velocities, model.tracers),
+                                                            filename = output_name * "_$(rank)",
+                                                            schedule = TimeInterval(1day),
+                                                            overwrite_existing = true)
+    end
+
+    run!(simulation)
+
+    return nothing
+end
+
+rx = parse(Int, get(ENV, "RX", "1"))
+ry = parse(Int, get(ENV, "RY", "1"))
+
+ranks = (rx, ry, 1)
+
+Nx = parse(Int, get(ENV, "NX", "1440"))
+Ny = parse(Int, get(ENV, "NY", "600"))
+Nz = parse(Int, get(ENV, "NZ", "100"))
+
+grid_size = (Nx, Ny, Nz)
+
+@info "Running HydrostaticFreeSurface model with ranks $ranks and grid size $grid_size"
+run_hydrostatic_simulation!(grid_size, ranks)
diff --git a/validation/distributed_simulations/distributed_scaling/distributed_nonhydrostatic_simulation.jl b/validation/distributed_simulations/distributed_scaling/distributed_nonhydrostatic_simulation.jl
new file mode 100644
index 0000000000..7a963edc4d
--- /dev/null
+++ b/validation/distributed_simulations/distributed_scaling/distributed_nonhydrostatic_simulation.jl
@@ -0,0 +1,103 @@
+using MPI
+MPI.Init()
+
+using JLD2
+using Statistics: mean
+using Printf
+using Oceananigans
+using Oceananigans.Utils: prettytime
+using Oceananigans.DistributedComputations
+using Oceananigans.Grids: node
+using Oceananigans.Advection: cell_advection_timescale
+using Oceananigans.Units
+
+@inline function bᵢ(x, y, z, p) 
+    b = - 1 / (p.Ly)^2 * y^2 + 1
+    return p.N² * z + p.Δb * b
+end
+
+function run_nonhydrostatic_simulation!(grid_size, ranks; 
+                                        topology  = (Periodic, Periodic, Bounded),
+                                        output_name = nothing, 
+                                        timestepper = :QuasiAdamsBashforth2,
+                                        CFL = 0.5)
+        
+    arch  = Distributed(GPU(); partition = Partition(ranks...))
+    grid  = RectilinearGrid(arch; size = grid_size, x = (0, 4096),
+			    		    y = (-2048, 2048),
+					        z = (-512, 0), topology,
+                            halo = (6, 6, 6))
+
+    N² = 4e-6
+    Δb = 0.001
+    Ly = 1024
+
+    b_bcs = FieldBoundaryConditions(top = FluxBoundaryCondition(5e-9), 
+                                    bottom = GradientBoundaryCondition(N²))
+
+    @inline function b_restoring(i, j, k, grid, clock, fields, p) # NOTE(review): never passed to the model below as a forcing; confirm whether that is intended
+        @inbounds begin
+            x, y, z = node(i, j, k, grid, Center(), Center(), Center())
+            return 1 / p.λ * (bᵢ(x, y, z, p) - fields.b[i, j, k]) # difference between bᵢ and the current b, scaled by 1/λ
+        end
+    end
+                                    
+    params = (; N², Δb, Ly, λ = 10days)
+
+    model = NonhydrostaticModel(; grid, 
+                                  advection = WENO(order = 9), 
+                                  coriolis = FPlane(f = -1e-5),
+				                  tracers = :b, 
+                                  buoyancy = BuoyancyTracer(),
+                                  boundary_conditions = (; b = b_bcs),
+                                  timestepper)
+    
+    @inline bᵣ(x, y, z) = bᵢ(x, y, z, params) + Δb * rand() / 1000
+    @inline uᵣ(x, y, z) = (rand() - 0.5) * 0.001
+    
+    set!(model, u = uᵣ, v = uᵣ, b = bᵣ)
+    
+    wtime = Ref(time_ns())
+    
+    function progress(sim) 
+        @info @sprintf("iteration: %d, Δt: %2e, wall time: %s (|u|, |v|, |w|): %.2e %.2e %.2e, b: %.2e \n", 
+              sim.model.clock.iteration, sim.Δt, prettytime((time_ns() - wtime[])*1e-9),
+              maximum(abs, sim.model.velocities.u), maximum(abs, sim.model.velocities.v), 
+              maximum(abs, sim.model.velocities.w), maximum(abs, sim.model.tracers.b))
+       wtime[] = time_ns()
+    end
+
+    simulation = Simulation(model; Δt=1.0, stop_time = 20days, stop_iteration = 100)
+                        
+    # Adaptive time-stepping
+    wizard = TimeStepWizard(cfl=CFL, max_change=1.1, min_Δt=0.5, max_Δt=60.0)
+    simulation.callbacks[:wizard] = Callback(wizard, IterationInterval(10))
+    simulation.callbacks[:progress] = Callback(progress, IterationInterval(10))
+   
+    rank = MPI.Comm_rank(MPI.COMM_WORLD)
+
+    if !isnothing(output_name)
+        simulation.output_writers[:fields] = JLD2OutputWriter(model, merge(model.velocities, model.tracers),
+                                                            filename = output_name * "_$(rank)",
+                                                            schedule = TimeInterval(1hour),
+                                                            overwrite_existing = true)
+    end
+    
+    run!(simulation)
+
+    return nothing
+end
+
+rx = parse(Int, get(ENV, "RX", "1"))
+ry = parse(Int, get(ENV, "RY", "1"))
+
+ranks = (rx, ry, 1)
+
+Nx = parse(Int, get(ENV, "NX", "256"))
+Ny = parse(Int, get(ENV, "NY", "256"))
+Nz = parse(Int, get(ENV, "NZ", "256"))
+
+grid_size = (Nx, Ny, Nz)
+
+@info "Running Nonhydrostatic model with ranks $ranks and grid size $grid_size"
+run_nonhydrostatic_simulation!(grid_size, ranks)
\ No newline at end of file
diff --git a/validation/distributed_simulations/distributed_scaling/job_script.sh b/validation/distributed_simulations/distributed_scaling/job_script.sh
new file mode 100644
index 0000000000..9b0a3647f5
--- /dev/null
+++ b/validation/distributed_simulations/distributed_scaling/job_script.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=500GB
+#SBATCH --time 24:00:00
+#SBATCH -o output_${SIMULATION}_RX${RX}_RY${RY}_NX${NX}_NY${NY}
+#SBATCH -e error_${SIMULATION}_RX${RX}_RY${RY}_NX${NX}_NY${NY}
+
+## modules setup
+# Load modules: cuda and cuda-aware mpi
+module purge all
+module add spack
+# Example:
+# module add cuda/11.4
+# module load openmpi/3.1.6-cuda-pmi-ucx-slurm-jhklron
+
+# MPI specific exports (usually not needed)
+# export OMPI_MCA_pml=^ucx
+# export OMPI_MCA_osc=^ucx
+# export OMPI_MCA_btl_openib_allow_ib=true
+
+# Number of threads in SLURM mode
+export JULIA_NUM_THREADS=${SLURM_CPUS_PER_TASK:=1}
+
+# Julia-specific environment variables
+export JULIA_NVTX_CALLBACKS=gc
+export JULIA_CUDA_MEMORY_POOL=none
+
+cat > launch.sh << EoF_s
+#! /bin/sh
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+exec "\$@"
+EoF_s
+chmod +x launch.sh  # wrapper: sets CUDA_VISIBLE_DEVICES, then execs its arguments unchanged
+
+# Add an NSYS trace only if requested (PROFILE_TRACE defaults to 0 when unset)
+if test "${PROFILE_TRACE:-0}" = 1; then
+   NSYS="nsys profile --trace=nvtx,cuda,mpi --output=${SIMULATION}_RX${RX}_RY${RY}_NX${NX}_NY${NY}"
+fi
+
+if test "${SIMULATION:-}" = "hydrostatic"; then
+   RUNFILE=distributed_hydrostatic_simulation.jl
+else
+   RUNFILE=distributed_nonhydrostatic_simulation.jl
+fi
+
+$NSYS srun --mpi=pmi2 ./launch.sh $JULIA --check-bounds=no --project $RUNFILE
diff --git a/validation/distributed_simulations/distributed_scaling/run_tests.sh b/validation/distributed_simulations/distributed_scaling/run_tests.sh
new file mode 100755
index 0000000000..408a1d430f
--- /dev/null
+++ b/validation/distributed_simulations/distributed_scaling/run_tests.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+#####
+##### Disclaimer: this script can be used only on SLURM type systems.
+##### For PBS systems `job_script.sh` needs to be corrected to use the `qsub` syntax
+#####
+
+# Before running, make sure that:
+#
+# 1) The julia environmental variables (defined in this file) point to the right path
+#
+# 2) the NGPUS_PER_NODE variable is correct (in this file)
+#
+# 3) If the system is equipped with nsys profiler it is possible to enable a trace with PROFILE_TRACE=1 (in this file)
+#
+# 4) Oceananigans is instantiated with the correct MPI build:
+# 	(run these lines in a gpu node substituting modules and paths)
+# 	$ module load my_cuda_module
+# 	$ module load my_cuda_aware_mpi_module
+# 	$ export JULIA_DEPOT_PATH="/path/to/depot"
+# 	$ export JULIA="/path/to/julia"
+#   $ $JULIA --check-bounds=no -e 'using Pkg; Pkg.add("MPIPreferences");'
+# 	$ $JULIA --project --check-bounds=no -e 'using MPIPreferences; MPIPreferences.use_system_binaries()'
+#   $ $JULIA --project --check-bounds=no -e 'using Pkg; Pkg.build("MPI")'
+#   $ $JULIA --project --check-bounds=no -e 'using Pkg; Pkg.instantiate()'
+#
+# 5) correct modules are loaded in job_script.sh
+#
+# 6) SBATCH variables in the job_script.sh file are correct (check memory, time)
+#
+# 7) The system has at least max(RX) * max(RY) gpus
+#
+# 8) Choose if testing the hydrostatic or nonhydrostatic model (SIMULATION variable in this file)
+#
+# 9) Choose if measuring the weak or strong scaling (SCALING variable in this file)
+#
+# Finally -> $ ./run_tests.sh
+
+# Julia-specific environment variables
+export JULIA_DEPOT_PATH="/path/to/depot"
+export JULIA="/path/to/julia"
+
+# PROFILE_TRACE=1 only if the system is equipped with nsys
+export PROFILE_TRACE=0
+
+# Number of gpus per node
+export NGPUS_PER_NODE=4
+	
+# Choice between nonhydrostatic and hydrostatic
+export SIMULATION=nonhydrostatic
+# Choice between strong and weak
+export SCALING=weak
+
+for RX in 1 2 4 8 16 32 64; do
+    for RY in 1 2 4 8 16 32 64; do
+        
+		export RX
+        export RY
+
+		if test $SIMULATION = "hydrostatic"; then 
+			if test $SCALING = "weak"; then
+				# Grid size for Weak scaling tests (Hydrostatic)
+				export NX=$((1440 * RX))
+				export NY=$((600 * RY))
+				export NZ=100 
+			else
+				# Grid size for Strong scaling tests (Hydrostatic)
+				export NX=1440
+				export NY=600
+				export NZ=100 
+			fi
+		else
+			if test $SCALING = "weak"; then
+				# Grid size for Weak scaling tests (Nonhydrostatic)
+				export NX=$((512 * RX))
+				export NY=$((512 * RY))
+				export NZ=256 
+			else
+				# Grid size for Strong scaling tests (Nonhydrostatic)
+				export NX=512
+				export NY=512
+				export NZ=256 
+			fi
+		fi
+		
+		RANKS=$((RX * RY))
+		# Round the node count up and cap tasks per node at RANKS, so fewer ranks than GPUs per node still gets 1 node.
+		export NNODES=$(( (RANKS + NGPUS_PER_NODE - 1) / NGPUS_PER_NODE ))
+		export NTASKS=$(( RANKS < NGPUS_PER_NODE ? RANKS : NGPUS_PER_NODE ))
+
+		echo ""
+		echo "(RX, RY) = $RX, $RY"
+		echo "(NX, NY) = $NX, $NY"
+		echo "(NNODES, NTASKS) = $NNODES, $NTASKS"
+
+		# ====================================================== #
+		# ================== RUN SCALING TEST ================== #
+		# ====================================================== #
+		# sbatch does not expand shell variables in #SBATCH lines, so the per-run log names are passed here.
+		sbatch -N ${NNODES} --gres=gpu:${NTASKS} --ntasks-per-node=${NTASKS} \
+			-o "output_${SIMULATION}_RX${RX}_RY${RY}_NX${NX}_NY${NY}" -e "error_${SIMULATION}_RX${RX}_RY${RY}_NX${NX}_NY${NY}" job_script.sh
+		# Use qsub on PBS systems!!!
+		# qsub pbs_job_script.sh
+    done
+done
diff --git a/validation/immersed_boundaries/2D_rough_rayleighbenard.jl b/validation/immersed_boundaries/2D_rough_rayleighbenard.jl
new file mode 100644
index 0000000000..d620e0bb22
--- /dev/null
+++ b/validation/immersed_boundaries/2D_rough_rayleighbenard.jl
@@ -0,0 +1,295 @@
+using Oceananigans
+using CairoMakie
+using Printf
+using JLD2
+using Oceananigans.Models.NonhydrostaticModels: ImmersedPoissonSolver, DiagonallyDominantThreeDimensionalPreconditioner
+
+#####
+##### Model setup
+#####
+
+function run_simulation(solver, preconditioner; Nr, Ra, Nz, Pr=1, IPS_reltol=1e-10)
+    Lx = 1
+    Lz = 1
+    Nx = Nz
+
+    h = Lx / Nr / 2
+    x₀s = h:2h:Lx-h
+
+    ν = 1
+    κ = ν / Pr
+    S = Ra * ν * κ / Lz ^ 4
+    
+    grid = RectilinearGrid(GPU(), Float64,
+                           size = (Nx, Nz), 
+                           halo = (4, 4),
+                           x = (0, Lx),
+                           z = (0, Lz),
+                           topology = (Bounded, Flat, Bounded))
+    
+    @inline function local_roughness_bottom(x, x₀, h)
+        if x > x₀ - h && x <= x₀
+            return x + h - x₀
+        elseif x > x₀ && x <= x₀ + h
+            return -x + h + x₀
+        else
+            return 0
+        end
+    end
+
+    @inline function local_roughness_top(x, x₀, h)
+        if x > x₀ - h && x <= x₀
+            return -x - h + x₀
+        elseif x > x₀ && x <= x₀ + h
+            return x - h - x₀
+        else
+            return 0
+        end
+    end
+
+    @inline roughness_bottom(x, y, z) = z <= sum([local_roughness_bottom(x, x₀, h) for x₀ in x₀s])
+    @inline roughness_top(x, y, z) = z >= sum([local_roughness_top(x, x₀, h) for x₀ in x₀s]) + Lz
+    @inline mask(x, y, z) = roughness_bottom(x, y, z) | roughness_top(x, y, z)
+
+    grid = ImmersedBoundaryGrid(grid, GridFittedBoundary(mask))
+
+    @info "Created $grid"
+
+    @inline function rayleigh_benard_buoyancy(x, y, z, t)
+        above_centerline = z > Lz / 2
+        return ifelse(above_centerline, -S/2, S/2)
+    end
+    
+    u_bcs = FieldBoundaryConditions(top=ValueBoundaryCondition(0), bottom=ValueBoundaryCondition(0), immersed=ValueBoundaryCondition(0))
+
+    v_bcs = FieldBoundaryConditions(top=ValueBoundaryCondition(0), bottom=ValueBoundaryCondition(0),
+                                    east=ValueBoundaryCondition(0), west=ValueBoundaryCondition(0),
+                                    immersed=ValueBoundaryCondition(0))
+
+    w_bcs = FieldBoundaryConditions(east=ValueBoundaryCondition(0), west=ValueBoundaryCondition(0),
+                                    immersed=ValueBoundaryCondition(0))
+
+    b_bcs = FieldBoundaryConditions(top=ValueBoundaryCondition(-S/2), bottom=ValueBoundaryCondition(S/2),
+                                    immersed=ValueBoundaryCondition(rayleigh_benard_buoyancy))
+
+    Δt = 1e-10
+    max_Δt = 1e-5
+    
+    if solver == "FFT"
+        model = NonhydrostaticModel(; grid,
+                                    # advection = CenteredSecondOrder(),
+                                    advection = WENO(order=7),
+                                    tracers = (:b),
+                                    buoyancy = BuoyancyTracer(),
+                                    closure = ScalarDiffusivity(ν=ν, κ=κ),
+                                    # timestepper = :RungeKutta3,
+                                    boundary_conditions=(; u=u_bcs, v=v_bcs, w=w_bcs, b=b_bcs))
+    else
+        model = NonhydrostaticModel(; grid,
+                                    pressure_solver = ImmersedPoissonSolver(grid, preconditioner=preconditioner, reltol=IPS_reltol),
+                                    # advection = CenteredSecondOrder(),
+                                    advection = WENO(order=7),
+                                    tracers = (:b),
+                                    buoyancy = BuoyancyTracer(),
+                                    closure = ScalarDiffusivity(ν=ν, κ=κ),
+                                    # timestepper = :RungeKutta3,
+                                    boundary_conditions=(; u=u_bcs, v=v_bcs, w=w_bcs, b=b_bcs))
+    end
+
+    @info "Created $model"
+    @info "with pressure solver $(model.pressure_solver)"
+    @info "with b boundary conditions $(model.tracers.b.boundary_conditions)"
+
+    # b_initial(x, y, z) = -S*z + S/2 - rand() * Ra / 100000
+    b_initial(x, y, z) = - rand() * Ra / 100000
+    
+    set!(model, b=b_initial)
+    
+    #####
+    ##### Simulation
+    #####
+    
+    simulation = Simulation(model, Δt=Δt, stop_iteration=1500000)
+
+    # wizard = TimeStepWizard(max_change=1.05, max_Δt=max_Δt, cfl=0.6)
+    # simulation.callbacks[:wizard] = Callback(wizard, IterationInterval(1))
+
+    wall_time = Ref(time_ns())
+    
+    b = model.tracers.b
+    u, v, w = model.velocities
+    WB = Average(w * b, dims=(1, 2))
+    
+    δ = Field(∂x(u) + ∂y(v) + ∂z(w))
+    compute!(δ)
+
+    function print_progress(sim)
+        elapsed = time_ns() - wall_time[]
+        # Progress is measured in iterations: the simulation is built with stop_iteration only, no stop_time.
+        msg = @sprintf("[%05.2f%%] i: %d, t: %s, wall time: %s, max(u): (%6.3e, %6.3e, %6.3e) m/s, max(b) %6.3e, next Δt: %s",
+                        100 * (sim.model.clock.iteration / sim.stop_iteration),
+                        sim.model.clock.iteration,
+                        prettytime(sim.model.clock.time),
+                        prettytime(1e-9 * elapsed),
+                        maximum(abs, sim.model.velocities.u),
+                        maximum(abs, sim.model.velocities.v),
+                        maximum(abs, sim.model.velocities.w),
+                        maximum(abs, sim.model.tracers.b),
+                        prettytime(sim.Δt))
+
+        pressure_solver = sim.model.pressure_solver
+        if pressure_solver isa ImmersedPoissonSolver
+            solver_iterations = pressure_solver.pcg_solver.iteration
+            msg *= string(", solver iterations: ", solver_iterations)
+        end
+
+        @info msg
+
+        wall_time[] = time_ns() # restart the wall-clock timer for the next report
+
+        return nothing
+    end
+                       
+    simulation.callbacks[:p] = Callback(print_progress, IterationInterval(1000))
+    
+    solver_type = model.pressure_solver isa ImmersedPoissonSolver ? "ImmersedPoissonSolver_reltol_$(IPS_reltol)" : "FFTBasedPoissonSolver"
+    prefix = "2D_rough_rayleighbenard_" * solver_type
+    
+    outputs = merge(model.velocities, model.tracers, (; δ))
+
+    function init_save_some_metadata!(file, model)
+        file["metadata/author"] = "Xin Kai Lee"
+        file["metadata/parameters/density"] = 1027
+        file["metadata/parameters/rayleigh_number"] = Ra
+        file["metadata/parameters/prandtl_number"] = Pr
+        return nothing
+    end
+    
+    simulation.output_writers[:jld2] = JLD2OutputWriter(model, outputs;
+                                                        filename = prefix * "_Ra_$(Ra)_Nr_$(Nr)_Nz_$(Nz)_WENO_fields",
+                                                        # schedule = TimeInterval(5e-4),
+                                                        schedule = IterationInterval(10000),
+                                                        overwrite_existing = true,
+                                                        init = init_save_some_metadata!)
+    
+    simulation.output_writers[:timeseries] = JLD2OutputWriter(model, (; WB);
+                                                              filename = prefix * "_Ra_$(Ra)_Nr_$(Nr)_Nz_$(Nz)_WENO_time_series",
+                                                            #   schedule = TimeInterval(5e-4),
+                                                        schedule = IterationInterval(10000),
+                                                              overwrite_existing = true,
+                                                          init = init_save_some_metadata!)
+    
+    run!(simulation)
+end
+
+Nr = 8
+Ra = 1e11
+Nz = 512
+IPS_reltol = 1e-10
+
+run_simulation("ImmersedPoissonSolver", "FFT", Nr=Nr, Ra=Ra, Nz=Nz, IPS_reltol=IPS_reltol)
+run_simulation("FFT", nothing, Nr=Nr, Ra=Ra, Nz=Nz)
+#####
+##### Visualize
+#####
+##
+filename_FFT = "2D_rough_rayleighbenard_FFTBasedPoissonSolver_Ra_$(Ra)_Nr_$(Nr)_Nz_$(Nz)_WENO_fields.jld2"
+filename_FFT_timeseries = "2D_rough_rayleighbenard_FFTBasedPoissonSolver_Ra_$(Ra)_Nr_$(Nr)_Nz_$(Nz)_WENO_time_series.jld2"
+
+metadata = jldopen(filename_FFT, "r") do file
+    metadata = Dict()
+    for key in keys(file["metadata/parameters"])
+        metadata[key] = file["metadata/parameters/$(key)"]
+    end
+    return metadata
+end
+
+κ = 1
+S = metadata["rayleigh_number"]
+
+bt_FFT = FieldTimeSeries(filename_FFT, "b")
+ut_FFT = FieldTimeSeries(filename_FFT, "u")
+wt_FFT = FieldTimeSeries(filename_FFT, "w")
+δt_FFT = FieldTimeSeries(filename_FFT, "δ")
+Nu_FFT = FieldTimeSeries(filename_FFT_timeseries, "WB") ./ (κ * S)
+times = bt_FFT.times
+# The ImmersedPoissonSolver run embeds the solver tolerance in its output prefix (see `solver_type` in run_simulation).
+filename_PCG = "2D_rough_rayleighbenard_ImmersedPoissonSolver_reltol_$(IPS_reltol)_Ra_$(Ra)_Nr_$(Nr)_Nz_$(Nz)_WENO_fields.jld2"
+filename_PCG_timeseries = "2D_rough_rayleighbenard_ImmersedPoissonSolver_reltol_$(IPS_reltol)_Ra_$(Ra)_Nr_$(Nr)_Nz_$(Nz)_WENO_time_series.jld2"
+
+bt_PCG = FieldTimeSeries(filename_PCG, "b")
+ut_PCG = FieldTimeSeries(filename_PCG, "u")
+wt_PCG = FieldTimeSeries(filename_PCG, "w")
+δt_PCG = FieldTimeSeries(filename_PCG, "δ")
+Nu_PCG = FieldTimeSeries(filename_PCG_timeseries, "WB") ./ (κ * S)
+
+fig = Figure(resolution=(1500, 1000))
+n = Observable(1)
+
+titlestr = @lift @sprintf("t = %.2f", times[$n])
+
+blim = maximum([maximum(abs, bt_FFT), maximum(abs, bt_PCG)])
+ulim = maximum([maximum(abs, ut_FFT), maximum(abs, ut_PCG)])
+wlim = maximum([maximum(abs, wt_FFT), maximum(abs, wt_PCG)])
+δlim = maximum([maximum(abs, δt_FFT), maximum(abs, δt_PCG)])
+# δlim = sqrt(eps(eltype(δt_FFT[1])))
+# δlim = 1e-7
+Nulim = maximum([maximum(abs, Nu_FFT), maximum(abs, Nu_PCG)])
+
+axb_FFT = Axis(fig[1, 1], title="b (FFT solver)")
+axu_FFT = Axis(fig[1, 2], title="u (FFT solver)")
+axw_FFT = Axis(fig[1, 3], title="w (FFT solver)")
+axd_FFT = Axis(fig[1, 4], title="Divergence (FFT solver), colorrange=($(-δlim), $(δlim))")
+
+axb_PCG = Axis(fig[2, 1], title="b (PCG solver)")
+axu_PCG = Axis(fig[2, 2], title="u (PCG solver)")
+axw_PCG = Axis(fig[2, 3], title="w (PCG solver)")
+axd_PCG = Axis(fig[2, 4], title="Divergence (PCG solver), colorrange=($(-δlim), $(δlim))")
+
+axNu = Axis(fig[3, 2:3], title="Nu", xlabel="Nu", ylabel="z")
+
+bn_FFT = @lift interior(bt_FFT[$n], :, 1, :)
+un_FFT = @lift interior(ut_FFT[$n], :, 1, :)
+wn_FFT = @lift interior(wt_FFT[$n], :, 1, :)
+δn_FFT = @lift interior(δt_FFT[$n], :, 1, :)
+Nun_FFT = @lift Nu_FFT[1, 1, :, $n]
+
+bn_PCG = @lift interior(bt_PCG[$n], :, 1, :)
+un_PCG = @lift interior(ut_PCG[$n], :, 1, :)
+wn_PCG = @lift interior(wt_PCG[$n], :, 1, :)
+δn_PCG = @lift interior(δt_PCG[$n], :, 1, :)
+Nun_PCG = @lift Nu_PCG[1, 1, :, $n]
+
+Nx = bt_FFT.grid.Nx
+Nz = bt_FFT.grid.Nz
+Nt = length(bt_FFT.times)
+
+xC = bt_FFT.grid.xᶜᵃᵃ[1:Nx]
+zC = bt_FFT.grid.zᵃᵃᶜ[1:Nz]
+xNu, yNu, zNu = nodes(FieldTimeSeries(filename_FFT_timeseries, "WB"))
+
+heatmap!(axb_FFT, xC, zC, bn_FFT, colormap=:balance, colorrange=(-blim, blim))
+heatmap!(axu_FFT, xC, zC, un_FFT, colormap=:balance, colorrange=(-ulim, ulim))
+heatmap!(axw_FFT, xC, zC, wn_FFT, colormap=:balance, colorrange=(-wlim, wlim))
+heatmap!(axd_FFT, xC, zC, δn_FFT, colormap=:balance, colorrange=(-δlim, δlim))
+
+heatmap!(axb_PCG, xC, zC, bn_PCG, colormap=:balance, colorrange=(-blim, blim))
+heatmap!(axu_PCG, xC, zC, un_PCG, colormap=:balance, colorrange=(-ulim, ulim))
+heatmap!(axw_PCG, xC, zC, wn_PCG, colormap=:balance, colorrange=(-wlim, wlim))
+heatmap!(axd_PCG, xC, zC, δn_PCG, colormap=:balance, colorrange=(-δlim, δlim))
+
+lines!(axNu, Nun_FFT, zNu)
+lines!(axNu, Nun_PCG, zNu)
+xlims!(axNu, (0, Nulim))
+
+Label(fig[0, :], titlestr, font=:bold, tellwidth=false, tellheight=false)
+
+# display(fig)
+
+record(fig, "FFT_PCG_reltol_$(IPS_reltol)_2D_rough_rayleighbenard_Ra_$(Ra)_Nr_$(Nr)_Nz_$(Nz)_WENO7.mp4", 1:Nt, framerate=10) do nn
+    # @info string("Plotting frame ", nn, " of ", Nt)
+    n[] = nn
+end
+
+@info "Animation completed"
+##
\ No newline at end of file
diff --git a/validation/immersed_boundaries/mask_binary_operations.jl b/validation/immersed_boundaries/mask_binary_operations.jl
new file mode 100644
index 0000000000..fda92fb7d0
--- /dev/null
+++ b/validation/immersed_boundaries/mask_binary_operations.jl
@@ -0,0 +1,43 @@
+using Oceananigans, CairoMakie
+using Oceananigans.ImmersedBoundaries: mask_immersed_field!
+
+underlying_grid = RectilinearGrid(CPU(); size = (32, 32, 32), x = (-20, 20), y = (-20, 20), z = (-20, 0)) # CPU: the plotting loop below indexes fields element by element
+
+bathy(x, y) = 20 * exp(-(x^2 + y^2) / (2*10^2)) - 20
+
+grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bathy))
+
+c1 = CenterField(grid)
+c2 = CenterField(grid)
+
+Σc = c1 + c2
+Πc = c1 * c2
+ratio_c = c1 / c2
+pow_c = c1 ^ c2
+
+# Σc = 5, Πc = 6, ratio_c = 2/3, pow_c = 8
+
+fig = Figure()
+
+axs = [Axis(fig[1, 1], title = "+"),
+       Axis(fig[1, 2], title = "*"),
+       Axis(fig[2, 1], title = "/"),
+       Axis(fig[2, 2], title = "^")]
+
+xc, yc, zc = nodes(grid, Center(), Center(), Center())
+
+cos = []
+
+for (n, f) in enumerate([Σc, Πc, ratio_c, pow_c])
+    c1 isa Number || set!(c1, 2)
+    c2 isa Number || set!(c2, 3)
+
+    mask_immersed_field!(f)
+
+    @info f[1, 1, 16], f[16, 16, 16]
+
+    heatmap!(axs[n], xc, yc, [f[i, j, 16] for i in 1:grid.Nx, j in 1:grid.Ny], colorrange = (0, 8))
+end
+
+fig
+
diff --git a/validation/immersed_boundaries/nonlinear_topography.jl b/validation/immersed_boundaries/nonlinear_topography.jl
new file mode 100644
index 0000000000..5628f8fd17
--- /dev/null
+++ b/validation/immersed_boundaries/nonlinear_topography.jl
@@ -0,0 +1,232 @@
+using Oceananigans
+using CairoMakie
+using Printf
+using Roots
+using Oceananigans.Models.NonhydrostaticModels: ImmersedPoissonSolver
+
+#####
+##### Model setup
+#####
+
+"""
+    run_simulation(solver, preconditioner)
+
+Simulate stratified flow (uniform initial velocity `U₀`, buoyancy `N² z`) over an implicitly defined wavy
+bottom on a GPU grid with a single cell in `y`, and write `nonlinear_topography_<solver type>_fields.jld2`
+and `nonlinear_topography_<solver type>_time_series.jld2`, overwriting existing files.
+
+`solver == "FFT"` keeps the model's default pressure solver; any other value uses
+`ImmersedPoissonSolver(grid, preconditioner=preconditioner, reltol=1e-8)`, so `preconditioner` is unused for "FFT".
+"""
+function run_simulation(solver, preconditioner)
+    Nx = 1024
+    Nz = 2Nx
+    Ny = 1
+
+    underlying_grid = RectilinearGrid(GPU(), Float64,
+                                      size = (Nx, Ny, Nz),
+                                      halo = (4, 4, 4),
+                                      x = (0, 30), y = (0, 1), z = (0, 60),
+                                      topology = (Periodic, Periodic, Bounded))
+
+    k = 2π / 10
+    Δt = 1e-3
+    max_Δt = 1e-3 # referenced only by the commented-out TimeStepWizard below
+
+    N² = 1 / (150 * 1e-3)^2
+    U₀ = 5
+
+    m = √(N² / U₀^2 - k^2)
+    h₀ = 0.5
+
+    # Residual of the implicit bottom shape h = h₀ cos(k x + m h); `topography` finds its root in h.
+    # Alternative shape: h₀ * cos(k*x) * exp(-m*h) - h
+    nonlinear_topography(h, x) = h₀ * cos(k*x + m*h) - h
+
+    topography(x, y) = find_zero(h -> nonlinear_topography(h, x), 0.1) + h₀
+
+    grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(topography))
+
+    @info "Created $grid"
+
+    uv_bcs = FieldBoundaryConditions(top=FluxBoundaryCondition(0), bottom=ValueBoundaryCondition(0), immersed=ValueBoundaryCondition(0))
+
+    b_initial(x, y, z) = N² * z
+    b_target = LinearTarget{:z}(intercept=0, gradient=N²)
+    mask_top = GaussianMask{:z}(center=58, width=0.5)
+
+    # Sponge layer (Gaussian mask centred at z = 58): u → U₀, b → linear profile; v and w use the default target.
+    damping_rate = 1 / (3 * Δt)
+
+    v_sponge = w_sponge = Relaxation(rate=damping_rate, mask=mask_top)
+    u_sponge = Relaxation(rate=damping_rate, mask=mask_top, target=U₀)
+    b_sponge = Relaxation(rate=damping_rate, mask=mask_top, target=b_target)
+
+    # Both configurations share every setting except the pressure solver, so build the
+    # solver-specific keyword once instead of duplicating the whole constructor call.
+    solver_kwargs = solver == "FFT" ? NamedTuple() :
+        (; pressure_solver = ImmersedPoissonSolver(grid, preconditioner=preconditioner, reltol=1e-8))
+
+    model = NonhydrostaticModel(; grid, solver_kwargs...,
+                                advection = CenteredSecondOrder(), # alternative: WENO()
+                                tracers = :b,
+                                buoyancy = BuoyancyTracer(),
+                                # timestepper = :RungeKutta3,
+                                boundary_conditions = (; u=uv_bcs, v=uv_bcs),
+                                forcing = (u=u_sponge, v=v_sponge, w=w_sponge, b=b_sponge))
+
+    @info "Created $model"
+    @info "with pressure solver $(model.pressure_solver)"
+
+    set!(model, b=b_initial, u=U₀)
+
+    #####
+    ##### Simulation
+    #####
+
+    simulation = Simulation(model, Δt=Δt, stop_time=20)
+
+    # wizard = TimeStepWizard(max_change=1.05, max_Δt=max_Δt, cfl=0.6)
+    # simulation.callbacks[:wizard] = Callback(wizard, IterationInterval(1))
+
+    wall_time = Ref(time_ns())
+
+    b = model.tracers.b
+    u, v, w = model.velocities
+    B = Field(Integral(b))
+    compute!(B)
+
+    δ = Field(∂x(u) + ∂y(v) + ∂z(w)) # velocity divergence
+    compute!(δ)
+
+    ζ = Field(∂z(u) - ∂x(w))
+    compute!(ζ)
+
+    # Progress callback: logs the model state and the wall time elapsed since the previous call.
+    function print_progress(sim)
+        elapsed = time_ns() - wall_time[]
+
+        msg = @sprintf("[%05.2f%%] i: %d, t: %s, wall time: %s, max(u): (%6.3e, %6.3e, %6.3e) m/s, max(b) %6.3e, next Δt: %s",
+                        100 * (sim.model.clock.time / sim.stop_time),
+                        sim.model.clock.iteration,
+                        prettytime(sim.model.clock.time),
+                        prettytime(1e-9 * elapsed),
+                        maximum(abs, sim.model.velocities.u),
+                        maximum(abs, sim.model.velocities.v),
+                        maximum(abs, sim.model.velocities.w),
+                        maximum(abs, sim.model.tracers.b),
+                        prettytime(sim.Δt))
+
+        pressure_solver = sim.model.pressure_solver
+        if pressure_solver isa ImmersedPoissonSolver
+            msg *= string(", solver iterations: ", pressure_solver.pcg_solver.iteration)
+        end
+
+        @info msg
+
+        wall_time[] = time_ns()
+
+        return nothing
+    end
+
+    simulation.callbacks[:p] = Callback(print_progress, IterationInterval(100))
+
+    solver_type = model.pressure_solver isa ImmersedPoissonSolver ? "ImmersedPoissonSolver" : "FFTBasedPoissonSolver"
+    prefix = "nonlinear_topography_" * solver_type
+
+    outputs = merge(model.velocities, model.tracers, (; p=model.pressures.pNHS, δ, ζ))
+
+    simulation.output_writers[:jld2] = JLD2OutputWriter(model, outputs;
+                                                        filename = prefix * "_fields",
+                                                        # schedule = TimeInterval(2e-3),
+                                                        schedule = IterationInterval(50),
+                                                        overwrite_existing = true)
+
+    simulation.output_writers[:timeseries] = JLD2OutputWriter(model, (; B);
+                                                              filename = prefix * "_time_series",
+                                                              # schedule = TimeInterval(2e-3),
+                                                              schedule = IterationInterval(50),
+                                                              overwrite_existing = true)
+
+    run!(simulation)
+end
+
+run_simulation("ImmersedPoissonSolver", "FFT")
+run_simulation("FFT", nothing)
+
+#####
+##### Visualize
+#####
+##
+# Animate b, u, w and the velocity divergence δ saved by the two runs above:
+# the FFT-solver output fills row 1 of the figure and the PCG-solver output row 2.
+
+@info "Loading files"
+field_names = (:b, :u, :w, :δ)
+
+# Load every plotted field of one output file as a NamedTuple of FieldTimeSeries.
+function load_series(filename)
+    timeseries = map(field_names) do name
+        FieldTimeSeries(filename, string(name))
+    end
+    return NamedTuple{field_names}(timeseries)
+end
+
+series = (FFT = load_series("nonlinear_topography_FFTBasedPoissonSolver_fields.jld2"),
+          PCG = load_series("nonlinear_topography_ImmersedPoissonSolver_fields.jld2"))
+
+# Frame times and frame count are taken from the FFT run.
+times = series.FFT.b.times
+Nt = length(times)
+
+# Interior cell-centre coordinates, read from the grid stored with the FFT run.
+saved_grid = series.FFT.b.grid
+xC = saved_grid.xᶜᵃᵃ[1:saved_grid.Nx]
+zC = saved_grid.zᵃᵃᶜ[1:saved_grid.Nz]
+
+# Colour limits are shared between the two rows so the solvers can be compared directly.
+abs_max(name) = max(maximum(abs, series.FFT[name]), maximum(abs, series.PCG[name]))
+
+blim = abs_max(:b)
+ulim = abs_max(:u)
+wlim = abs_max(:w)
+δlim = 1e-8
+
+# `b` is drawn on (0, blim); the other fields use limits symmetric about zero.
+colorranges = (b = (0, blim),
+               u = (-ulim, ulim),
+               w = (-wlim, wlim),
+               δ = (-δlim, δlim))
+
+panel_titles = (b = "b",
+                u = "u",
+                w = "w",
+                δ = "Divergence")
+
+@info "Plotting"
+fig = Figure(resolution=(2000, 700))
+n = Observable(1) # index of the frame currently shown
+
+titlestr = @lift @sprintf("t = %.2f", times[$n])
+
+for (row, solver) in enumerate(keys(series)), (col, name) in enumerate(field_names)
+    fts = series[solver][name]
+    ax = Axis(fig[row, col], title="$(panel_titles[name]) ($solver solver)")
+
+    # x–z slice of frame `n` at the first y index; the observable updates whenever `n` changes.
+    frame = @lift interior(fts[$n], :, 1, :)
+
+    heatmap!(ax, xC, zC, frame, colormap=:balance, colorrange=colorranges[name])
+end
+
+Label(fig[0, :], titlestr, font=:bold, tellwidth=false, tellheight=false)
+
+# display(fig)
+
+# Write one movie frame per saved time by stepping the frame index `n`.
+record(fig, "FFT_PCG_nonlinear_topography.mp4", 1:Nt, framerate=10) do nn
+    # @info string("Plotting frame ", nn, " of ", Nt)
+    n[] = nn
+end
+@info "Animation completed"
+##
\ No newline at end of file
diff --git a/validation/multi_region/cubed_sphere_aquaplanet_visualization.jl b/validation/multi_region/cubed_sphere_aquaplanet_visualization.jl
index bafa569ddd..882cb4ead4 100644
--- a/validation/multi_region/cubed_sphere_aquaplanet_visualization.jl
+++ b/validation/multi_region/cubed_sphere_aquaplanet_visualization.jl
@@ -4,7 +4,6 @@ using JLD2
 using KernelAbstractions: @kernel, @index
 using KernelAbstractions.Extras.LoopInfo: @unroll
 using Oceananigans
-using Oceananigans.BuoyancyModels: ∂z_b
 using Oceananigans.Coriolis: fᶠᶠᵃ
 using Oceananigans.Grids: node, λnode, φnode, halo_size, total_size
 using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid
@@ -52,6 +51,42 @@ function geometric_z_faces(p)
     return z_faces
 end
 
+# z-faces (deepest first, surface at 0) from the surface down: 10 layers of 10 m, N = 20 layers whose
+# thickness follows a tanh profile (about 10 m to 100 m), then whole 100 m layers within the rest of `Lz`.
+function hyperbolic_tangential_z_faces(Lz)
+    Δz_tolerance = 1e-2
+    N = 20
+    b = (atanh(1 - Δz_tolerance) - atanh(-1 + Δz_tolerance))/(N-1)
+    k₀ = 1 - atanh(-1 + Δz_tolerance)/b
+    a = 45
+    c = a + 10
+    Δz = [a * tanh(b*(k - k₀)) + c for k in 1:N]
+    Nz₁ = 10
+    Nz₂ = N
+    # `sum(Δz)` carries round-off, so a depth that is an exact multiple of 100 m can land just below an
+    # integer; snap the quotient before truncating so the deepest 100 m layer is not silently dropped.
+    Nz₃ = trunc(Int, round((Lz - sum(Δz) - 100) / 100, digits=8))
+    Nz = Nz₁ + Nz₂ + Nz₃
+    z_faces = zeros(Nz+1)
+    for k in 1:Nz₁+1
+        z_faces[k] = 10(k - 1)
+    end
+    for k in Nz₁+2:Nz₁+Nz₂+1
+        z_faces[k] = z_faces[k-1] + Δz[k-Nz₁-1]
+    end
+    for k in Nz₁+Nz₂+2:Nz+1
+        z_faces[k] = z_faces[k-1] + 100
+    end
+    return reverse(-z_faces)
+end
+
+function custom_z_faces()
+    # 49 hand-specified face positions, ordered from the bottom (-3000) up to the surface (0).
+    return [-3000, -2900, -2800, -2700, -2600, -2500, -2400, -2300, -2200, -2100, -2000, -1900, -1800, -1700, -1600,
+            -1500, -1400, -1300, -1200, -1100, -1002, -904, -809, -717, -629, -547, -472, -404, -345, -294, -252,
+            -217, -189, -167, -149, -134, -122, -110, -100, -90, -80, -70, -60, -50, -40, -30, -20, -10, 0]
+end
+
 Lz = 3000
 h_b = 0.2 * Lz
 h_νz_κz = 100
@@ -85,18 +120,18 @@ my_parameters = (Lz          = Lz,
 
 radius = 6371e3
 f₀ = 1e-4
-Ld = (2/f₀ * sqrt(my_parameters.h_b * my_parameters.Δ/(1 - exp(-my_parameters.Lz/my_parameters.h_b)))
-      * (1 - exp(-my_parameters.Lz/(2my_parameters.h_b))))
+L_d = (2/f₀ * sqrt(my_parameters.h_b * my_parameters.Δ/(1 - exp(-my_parameters.Lz/my_parameters.h_b)))
+       * (1 - exp(-my_parameters.Lz/(2my_parameters.h_b))))
 print(
-"For an initial buoyancy profile decaying exponentially with depth, the Rossby radius of deformation is $Ld m.\n")
-Nx_min = ceil(Int, 2π * radius/(4Ld))
+"For an initial buoyancy profile decaying exponentially with depth, the Rossby radius of deformation is $L_d m.\n")
+Nx_min = ceil(Int, 2π * radius/(4L_d))
 print("The minimum number of grid points in each direction of the cubed sphere panels required to resolve this " *
       "Rossby radius of deformation is $(Nx_min).\n")
 
 arch = CPU()
 underlying_grid = ConformalCubedSphereGrid(arch;
                                            panel_size = (Nx, Ny, Nz),
-                                           z = geometric_z_faces(my_parameters),
+                                           z = hyperbolic_tangential_z_faces(Lz),
                                            horizontal_direction_halo = Nhalo,
                                            radius,
                                            partition = CubedSpherePartition(; R = 1))
@@ -288,130 +323,53 @@ import Oceananigans.Utils: getregion, _getregion
     return p.𝓋 * (b - B)
 end
 
-####
-#### Boundary conditions
-####
-
-@inline ϕ²(i, j, k, grid, ϕ) = @inbounds ϕ[i, j, k]^2
-
-@inline speedᶠᶜᶜ(i, j, k, grid, u, v) = @inbounds sqrt(u[i, j, k]^2 + ℑxyᶠᶜᵃ(i, j, k, grid, ϕ², v))
-@inline speedᶜᶠᶜ(i, j, k, grid, u, v) = @inbounds sqrt(ℑxyᶜᶠᵃ(i, j, k, grid, ϕ², u) + v[i, j, k]^2)
-
-@inline u_drag(i, j, grid, clock, fields, p) = (
-@inbounds - p.Cᴰ * speedᶠᶜᶜ(i, j, 1, grid, fields.u, fields.v) * fields.u[i, j, 1])
-@inline v_drag(i, j, grid, clock, fields, p) = (
-@inbounds - p.Cᴰ * speedᶜᶠᶜ(i, j, 1, grid, fields.u, fields.v) * fields.v[i, j, 1])
-
-u_bot_bc = FluxBoundaryCondition(u_drag, discrete_form = true, parameters = (; Cᴰ = my_parameters.Cᴰ))
-v_bot_bc = FluxBoundaryCondition(v_drag, discrete_form = true, parameters = (; Cᴰ = my_parameters.Cᴰ))
-top_stress_x = FluxBoundaryCondition(u_stress; discrete_form = true)
-top_stress_y = FluxBoundaryCondition(v_stress; discrete_form = true)
-
-u_bcs = FieldBoundaryConditions(bottom = u_bot_bc, top = top_stress_x)
-v_bcs = FieldBoundaryConditions(bottom = v_bot_bc, top = top_stress_y)
+extended_halos = true
+coriolis = HydrostaticSphericalCoriolis()
 
 my_buoyancy_parameters = (; Δ = my_parameters.Δ, h = my_parameters.h_b, Lz = my_parameters.Lz,
                             φ_max_b_lin = my_parameters.φ_max_b_lin, φ_max_b_par = my_parameters.φ_max_b_par,
                             φ_max_b_cos = my_parameters.φ_max_b_cos, 𝓋 = my_parameters.𝓋)
-top_restoring_bc = FluxBoundaryCondition(buoyancy_restoring; field_dependencies = :b,
-                                         parameters = my_buoyancy_parameters)
-b_bcs = FieldBoundaryConditions(top = top_restoring_bc)
-
-####
-#### Model setup
-####
-
-momentum_advection = VectorInvariant()
-tracer_advection   = WENO()
-substeps           = 50
-free_surface       = SplitExplicitFreeSurface(grid; substeps, extended_halos = false)
-
-νh = 5e+3
-κh = 1e+2 
-horizontal_diffusivity = HorizontalScalarDiffusivity(ν=νh, κ=κh) # Laplacian viscosity and diffusivity
-
-νz_surface = 1e-3
-νz_bottom = 1e-4
-
-struct MyVerticalViscosity{FT} <: Function
-    Lz  :: FT
-    h   :: FT
-    νzs :: FT
-    νzb :: FT
-end
-
-using Adapt
-
-Adapt.adapt_structure(to, ν::MyVerticalViscosity) = MyVerticalViscosity(Adapt.adapt(to, ν.Lz),  Adapt.adapt(to, ν.h),
-                                                                        Adapt.adapt(to, ν.νzs), Adapt.adapt(to, ν.νzb))
-
-@inline (ν::MyVerticalViscosity)(x, y, z, t) = ν.νzb + (ν.νzs - ν.νzb) * exponential_profile_in_z(z, ν.Lz, ν.h)
-
-νz = MyVerticalViscosity(float(Lz), float(h_νz_κz), νz_surface, νz_bottom)
-
-κz_surface = 2e-4
-κz_bottom = 2e-5
-
-κz = MyVerticalViscosity(float(Lz), float(h_νz_κz), κz_surface, κz_bottom)
-
-vertical_diffusivity  = VerticalScalarDiffusivity(VerticallyImplicitTimeDiscretization(), ν = νz, κ = κz)
-
-convective_adjustment = ConvectiveAdjustmentVerticalDiffusivity(VerticallyImplicitTimeDiscretization(),
-                                                                convective_κz = 1.0)
-
-coriolis = HydrostaticSphericalCoriolis()
-
-model = HydrostaticFreeSurfaceModel(; grid,
-                                      momentum_advection,
-                                      tracer_advection,
-                                      free_surface,
-                                      coriolis,
-                                      closure = (horizontal_diffusivity, vertical_diffusivity, convective_adjustment),
-                                      tracers = :b,
-                                      buoyancy = BuoyancyTracer(),
-                                      boundary_conditions = (u = u_bcs, v = v_bcs, b = b_bcs))
-
-#####
-##### Model initialization
-#####
-
 @inline initial_buoyancy(λ, φ, z) = (my_buoyancy_parameters.Δ * cosine_profile_in_y(φ, my_buoyancy_parameters)
                                      * exponential_profile_in_z(z, my_parameters.Lz, my_parameters.h_b))
 # Specify the initial buoyancy profile to match the buoyancy restoring profile.
-set!(model, b = initial_buoyancy) 
+bᵢ = CenterField(grid)
+set!(bᵢ, initial_buoyancy)
+
+uᵢ = XFaceField(grid)
+vᵢ = YFaceField(grid)
 
 initialize_velocities_based_on_thermal_wind_balance = false
 # If the above flag is set to true, meaning the velocities are initialized using thermal wind balance, set
 # φ_max_b_cos within the range [70, 80], and specify the latitudinal variation in buoyancy as
 # p.Δ * double_cosine_profile_in_y(φ, p) in both the initial buoyancy and the surface buoyancy restoring profiles.
 if initialize_velocities_based_on_thermal_wind_balance
-    fill_halo_regions!(model.tracers.b)
+    fill_halo_regions!(bᵢ)
 
-    Ω = model.coriolis.rotation_rate
+    Ω = coriolis.rotation_rate
     radius = grid.radius
 
     for region in 1:number_of_regions(grid), k in 1:Nz, j in 1:Ny, i in 1:Nx
-        numerator = model.tracers.b[region][i, j, k] - model.tracers.b[region][i, j-1, k]
+        numerator = bᵢ[region][i, j, k] - bᵢ[region][i, j-1, k]
         denominator = -2Ω * sind(grid[region].φᶠᶜᵃ[i, j]) * grid[region].Δyᶠᶜᵃ[i, j]
         if k == 1
             Δz_below = grid[region].zᵃᵃᶜ[k] - grid[region].zᵃᵃᶠ[k]
             u_below = 0 # no slip boundary condition
         else
             Δz_below = grid[region].Δzᵃᵃᶠ[k]
-            u_below = model.velocities.u[region][i, j, k-1]
+            u_below = uᵢ[region][i, j, k-1]
         end
-        model.velocities.u[region][i, j, k] = u_below + numerator/denominator * Δz_below
-        numerator = model.tracers.b[region][i, j, k] - model.tracers.b[region][i-1, j, k]
+        uᵢ[region][i, j, k] = u_below + numerator/denominator * Δz_below
+        numerator = bᵢ[region][i, j, k] - bᵢ[region][i-1, j, k]
         denominator = 2Ω * sind(grid[region].φᶜᶠᵃ[i, j]) * grid[region].Δxᶜᶠᵃ[i, j]
         if k == 1
             v_below = 0 # no slip boundary condition
         else
-            v_below = model.velocities.v[region][i, j, k-1]
+            v_below = vᵢ[region][i, j, k-1]
         end
-        model.velocities.v[region][i, j, k] = v_below + numerator/denominator * Δz_below
+        vᵢ[region][i, j, k] = v_below + numerator/denominator * Δz_below
     end
 
-    fill_halo_regions!((model.velocities.u, model.velocities.v))
+    fill_halo_regions!((uᵢ, vᵢ))
 end
 
 # Compute the initial vorticity.
@@ -433,7 +391,7 @@ function compute_vorticity!(grid, u, v, ζ)
     end
 end
 
-compute_vorticity!(grid, model.velocities.u, model.velocities.v, ζ)
+compute_vorticity!(grid, uᵢ, vᵢ, ζ)
 
 # Compute actual and reconstructed wind stress.
 location = (Center(), Center(), Center())
@@ -480,10 +438,7 @@ end
 @apply_regionally launch!(arch, grid, (Nx, Ny), _reconstruct_wind_stress!, grid, zonal_wind_stress_cc, τ_x, τ_x_r, τ_y_r)
 
 # Plot wind stress and initial fields.
-uᵢ = on_architecture(CPU(), deepcopy(model.velocities.u))
-vᵢ = on_architecture(CPU(), deepcopy(model.velocities.v))
 ζᵢ = on_architecture(CPU(), deepcopy(ζ))
-bᵢ = on_architecture(CPU(), deepcopy(model.tracers.b))
 
 latitude = extract_latitude(grid_cpu)
 cos_θ, sin_θ = calculate_sines_and_cosines_of_cubed_sphere_grid_angles(grid_cpu, "cc")
@@ -536,18 +491,21 @@ else
 end
 common_kwargs_η = common_kwargs_vertical_section
 
-@inline _deformation_radius(i, j, k, grid, C, buoyancy, coriolis) = sqrt(max(0, ∂z_b(i, j, k, grid, buoyancy, C))) / π /
-                                                                         abs(ℑxyᶜᶜᵃ(i, j, k, grid, fᶠᶠᵃ, coriolis))
+import Oceananigans.BuoyancyModels: ∂z_b
+@inline ∂z_b(i, j, k, grid, buoyancy) = ∂zᶜᶜᶠ(i, j, k, grid, buoyancy)
+
+@inline _deformation_radius(i, j, k, grid, buoyancy, coriolis) = (
+sqrt(max(0, ∂z_b(i, j, k, grid, buoyancy))) / π / abs(ℑxyᶜᶜᵃ(i, j, k, grid, fᶠᶠᵃ, coriolis)))
 
 φ_max_b = 75
 
-@kernel function _calculate_deformation_radius!(Ld, grid, tracers, buoyancy, coriolis)
+@kernel function _calculate_deformation_radius!(Ld, grid, buoyancy, coriolis)
     i, j = @index(Global, NTuple)
 
     @inbounds begin
         Ld[i, j, 1] = 0
         @unroll for k in 1:grid.Nz
-            Ld[i, j, 1] += Δzᶜᶜᶠ(i, j, k, grid) * _deformation_radius(i, j, k, grid, tracers, buoyancy, coriolis)
+            Ld[i, j, 1] += Δzᶜᶜᶠ(i, j, k, grid) * _deformation_radius(i, j, k, grid, buoyancy, coriolis)
         end
     end
 
@@ -567,17 +525,15 @@ end
 end
 
 Ldᵢ = Field((Center, Center, Nothing), grid)
-buoyancy = model.buoyancy
-tracers = model.tracers
-set!(tracers.b, bᵢ)
-coriolis = model.coriolis
 
-@apply_regionally launch!(arch, grid, :xy, _calculate_deformation_radius!, Ldᵢ, grid, tracers, buoyancy, coriolis)
+@apply_regionally launch!(arch, grid, :xy, _calculate_deformation_radius!, Ldᵢ, grid, bᵢ, coriolis)
 Ldᵢ_minimum = minimum(Ldᵢ)
 @apply_regionally launch!(arch, grid, :xy, _truncate_deformation_radius!, Ldᵢ, grid, Ldᵢ_minimum)
 Ldᵢ_at_specific_longitude_through_panel_center = zeros(2*Nx, 4);
 
-plot_initial_field = true
+plot_initial_field = false
+make_geo_heatlatlon_plots = true
+
 if plot_initial_field
     fig = panel_wise_visualization(grid_cpu, on_architecture(CPU(), τ_x); k = 1, common_kwargs...)
     save("cubed_sphere_aquaplanet_zonal_wind_stress.png", fig)
@@ -588,20 +544,22 @@ if plot_initial_field
     fig = panel_wise_visualization(grid_cpu, on_architecture(CPU(), τ_y_r); k = 1, common_kwargs...)
     save("cubed_sphere_aquaplanet_meridional_wind_stress_reconstructed.png", fig)
 
-    title = "Zonal wind stress"
-    fig = geo_heatlatlon_visualization(grid_cpu, on_architecture(CPU(), τ_x), title; levels = 1:1, common_kwargs...,
-                                       cbar_label = "zonal wind stress (N m⁻²)")
-    save("cubed_sphere_aquaplanet_zonal_wind_stress_geo_heatlatlon_plot.png", fig)
-
-    title = "Reconstructed zonal wind stress"
-    fig = geo_heatlatlon_visualization(grid_cpu, on_architecture(CPU(), τ_x_r), title; levels = 1:1, common_kwargs...,
-                                       cbar_label = "zonal wind stress (N m⁻²)")
-    save("cubed_sphere_aquaplanet_zonal_wind_stress_reconstructed_geo_heatlatlon_plot.png", fig)
-
-    title = "Reconstructed meridional wind stress"
-    fig = geo_heatlatlon_visualization(grid_cpu, on_architecture(CPU(), τ_y_r), title; levels = 1:1, common_kwargs...,
-                                       cbar_label = "meridional wind stress (N m⁻²)")
-    save("cubed_sphere_aquaplanet_meridional_wind_stress_reconstructed_geo_heatlatlon_plot.png", fig)
+    if make_geo_heatlatlon_plots
+        title = "Zonal wind stress"
+        fig = geo_heatlatlon_visualization(grid_cpu, on_architecture(CPU(), τ_x), title; levels = 1:1, common_kwargs...,
+                                           cbar_label = "zonal wind stress (N m⁻²)")
+        save("cubed_sphere_aquaplanet_zonal_wind_stress_geo_heatlatlon_plot.png", fig)
+
+        title = "Reconstructed zonal wind stress"
+        fig = geo_heatlatlon_visualization(grid_cpu, on_architecture(CPU(), τ_x_r), title; levels = 1:1,
+                                           common_kwargs..., cbar_label = "zonal wind stress (N m⁻²)")
+        save("cubed_sphere_aquaplanet_zonal_wind_stress_reconstructed_geo_heatlatlon_plot.png", fig)
+
+        title = "Reconstructed meridional wind stress"
+        fig = geo_heatlatlon_visualization(grid_cpu, on_architecture(CPU(), τ_y_r), title; levels = 1:1,
+                                           common_kwargs..., cbar_label = "meridional wind stress (N m⁻²)")
+        save("cubed_sphere_aquaplanet_meridional_wind_stress_reconstructed_geo_heatlatlon_plot.png", fig)
+    end
 
     if initialize_velocities_based_on_thermal_wind_balance
         uᵢ, vᵢ = orient_velocities_in_global_direction(grid_cpu, uᵢ, vᵢ, cos_θ, sin_θ; levels = 1:Nz)
@@ -616,21 +574,22 @@ if plot_initial_field
 
         fig = panel_wise_visualization(grid_cpu, ζᵢ; k = Nz, common_kwargs...)
         save("cubed_sphere_aquaplanet_ζᵢ.png", fig)
-        
-        title = "Initial zonal velocity"
-        fig = geo_heatlatlon_visualization(grid_cpu, uᵢ, title; k = Nz, common_kwargs...,
-                                           cbar_label = "zonal velocity (m s⁻¹)")
-        save("cubed_sphere_aquaplanet_uᵢ_geo_heatlatlon_plot.png", fig)
-
-        title = "Initial meridional velocity"
-        fig = geo_heatlatlon_visualization(grid_cpu, vᵢ, title; k = Nz, common_kwargs...,
-                                           cbar_label = "meridional velocity (m s⁻¹)")
-        save("cubed_sphere_aquaplanet_vᵢ_geo_heatlatlon_plot.png", fig)
-
-        title = "Initial relative vorticity"
-        fig = geo_heatlatlon_visualization(grid_cpu, ζᵢ, title; k = Nz, common_kwargs...,
-                                           cbar_label = "relative vorticity (s⁻¹)")
-        save("cubed_sphere_aquaplanet_ζᵢ_geo_heatlatlon_plot.png", fig)
+        if make_geo_heatlatlon_plots
+            title = "Initial zonal velocity"
+            fig = geo_heatlatlon_visualization(grid_cpu, uᵢ, title; k = Nz, common_kwargs...,
+                                               cbar_label = "zonal velocity (m s⁻¹)")
+            save("cubed_sphere_aquaplanet_uᵢ_geo_heatlatlon_plot.png", fig)
+
+            title = "Initial meridional velocity"
+            fig = geo_heatlatlon_visualization(grid_cpu, vᵢ, title; k = Nz, common_kwargs...,
+                                               cbar_label = "meridional velocity (m s⁻¹)")
+            save("cubed_sphere_aquaplanet_vᵢ_geo_heatlatlon_plot.png", fig)
+
+            title = "Initial relative vorticity"
+            fig = geo_heatlatlon_visualization(grid_cpu, ζᵢ, title; k = Nz, common_kwargs...,
+                                               cbar_label = "relative vorticity (s⁻¹)")
+            save("cubed_sphere_aquaplanet_ζᵢ_geo_heatlatlon_plot.png", fig)
+        end
 
         index, panel_index = 1, 1
         
@@ -673,16 +632,18 @@ if plot_initial_field
     fig = panel_wise_visualization(grid_cpu, on_architecture(CPU(), Ldᵢ); k = 1, common_kwargs_positive_scalar...)
     save("cubed_sphere_aquaplanet_Ldᵢ.png", fig)
     
-    title = "Initial buoyancy"
-    fig = geo_heatlatlon_visualization(grid_cpu, bᵢ, title; k = b_index, common_kwargs...,
-                                       cbar_label = "buoyancy (m s⁻²)")
-    save("cubed_sphere_aquaplanet_bᵢ_geo_heatlatlon_plot.png", fig)
-    
-    title = "Deformation radius"
-    fig = geo_heatlatlon_visualization(grid_cpu, on_architecture(CPU(), Ldᵢ), title; levels = 1:1,
-                                       common_kwargs_positive_scalar..., cbar_label = "deformation radius (m)")
-    save("cubed_sphere_aquaplanet_Ldᵢ_geo_heatlatlon_plot.png", fig)
-    
+    if make_geo_heatlatlon_plots
+        title = "Initial buoyancy"
+        fig = geo_heatlatlon_visualization(grid_cpu, bᵢ, title; k = b_index, common_kwargs...,
+                                           cbar_label = "buoyancy (m s⁻²)")
+        save("cubed_sphere_aquaplanet_bᵢ_geo_heatlatlon_plot.png", fig)
+
+        title = "Deformation radius"
+        fig = geo_heatlatlon_visualization(grid_cpu, on_architecture(CPU(), Ldᵢ), title; levels = 1:1,
+                                           common_kwargs_positive_scalar..., cbar_label = "deformation radius (m)")
+        save("cubed_sphere_aquaplanet_Ldᵢ_geo_heatlatlon_plot.png", fig)
+    end
+
     index, panel_index = 1, 1
     
     bᵢ_at_specific_longitude_through_panel_center[:, :, index] = (
@@ -726,85 +687,107 @@ compute_vorticity!(grid_cpu, u_f, v_f, ζ)
 w_f = file_c["w/data"]
 w_f = set_parent_field_data(grid_cpu, w_f, "cc"; levels = 1:Nz+1)
 
-η_f = file_c["η/data"]
-η_f = set_parent_field_data(grid_cpu, η_f, "cc"; ssh = true)
+if extended_halos
+    η_f_extended_halos = file_c["η/data"]
+    Hc = grid.Hx
+    Hc_extended = (size(η_f_extended_halos[1], 1) - Nc) ÷ 2
+    η_f = Field((Center, Center, Center), grid; indices = (:, :, Nz+1:Nz+1))
+    for region in 1:6, j in 1:Nc+2Hc, i in 1:Nc+2Hc
+        η_f[region][i, j, Nz+1] = η_f_extended_halos[region][i+Hc_extended, j+Hc_extended, 1]
+    end
+else
+    η_f = file_c["η/data"]
+    η_f = set_parent_field_data(grid_cpu, η_f, "cc"; ssh = true)
+end
 
 b_f = file_c["b/data"]
 b_f = set_parent_field_data(grid_cpu, b_f, "cc"; levels = 1:Nz)
-set!(tracers.b, b_f)
 
 Ld_f = Field((Center, Center, Nothing), grid)
-@apply_regionally launch!(arch, grid, :xy, _calculate_deformation_radius!, Ld_f, grid, tracers, buoyancy, coriolis)
+@apply_regionally launch!(arch, grid, :xy, _calculate_deformation_radius!, Ld_f, grid, b_f, coriolis)
 Ld_f_minimum = minimum(Ld_f)
 @apply_regionally launch!(arch, grid, :xy, _truncate_deformation_radius!, Ld_f, grid, Ld_f_minimum)
 Ld_f_at_specific_longitude_through_panel_center = zeros(2*Nx, 4);
 
 Δt = 5minutes
 simulation_time = iteration_id * Δt
-
-fig = panel_wise_visualization(grid_cpu, u_f_r; k = Nz, common_kwargs..., specify_plot_limits = true,
-                               plot_limits = (-3.5, 3.5))
+specify_plot_limits = true
+specify_η_limits = false
+specify_b_limits = false
+
+u_limits = (-0.75, 0.75)
+v_limits = (-0.25, 0.25)
+ζ_limits = (-5e-6, 5e-6)
+w_limits = (-5e-5, 5e-5)
+η_limits = (-10, 10)
+b_limits = (-0.0325, 0.0325)
+
+fig = panel_wise_visualization(grid_cpu, u_f_r; k = Nz, common_kwargs..., specify_plot_limits = specify_plot_limits,
+                               plot_limits = u_limits)
 save("cubed_sphere_aquaplanet_u_f_$iteration_id.png", fig)
 
-fig = panel_wise_visualization(grid_cpu, v_f_r; k = Nz, common_kwargs..., specify_plot_limits = true,
-                               plot_limits = (-2.25, 2.25))
+fig = panel_wise_visualization(grid_cpu, v_f_r; k = Nz, common_kwargs..., specify_plot_limits = specify_plot_limits,
+                               plot_limits = v_limits)
 save("cubed_sphere_aquaplanet_v_f_$iteration_id.png", fig)
 
-fig = panel_wise_visualization(grid_cpu, ζ_f; k = Nz, common_kwargs..., specify_plot_limits = true,
-                               plot_limits = (-1.25e-5, 1.25e-5))
+fig = panel_wise_visualization(grid_cpu, ζ_f; k = Nz, common_kwargs..., specify_plot_limits = specify_plot_limits,
+                               plot_limits = ζ_limits)
 save("cubed_sphere_aquaplanet_ζ_f_$iteration_id.png", fig)
 
-fig = panel_wise_visualization(grid_cpu, w_f; k = w_index, common_kwargs..., specify_plot_limits = true,
-                               plot_limits = (-2e-4, 2e-4))
+fig = panel_wise_visualization(grid_cpu, w_f; k = w_index, common_kwargs..., specify_plot_limits = specify_plot_limits,
+                               plot_limits = w_limits)
 save("cubed_sphere_aquaplanet_w_f_$iteration_id.png", fig)
 
-fig = panel_wise_visualization(grid_cpu, η_f; ssh = true, common_kwargs_η..., specify_plot_limits = true,
-                               plot_limits = (-15, 15))
+fig = panel_wise_visualization(grid_cpu, η_f; ssh = true, common_kwargs_η..., specify_plot_limits = specify_η_limits,
+                               plot_limits = η_limits)
 save("cubed_sphere_aquaplanet_η_f_$iteration_id.png", fig)
 
-fig = panel_wise_visualization(grid_cpu, b_f; k = b_index, common_kwargs..., specify_plot_limits = true,
-                               plot_limits = (-0.055, 0.055))
+fig = panel_wise_visualization(grid_cpu, b_f; k = b_index, common_kwargs..., specify_plot_limits = specify_b_limits,
+                               plot_limits = b_limits)
 save("cubed_sphere_aquaplanet_b_f_$iteration_id.png", fig)
 
-title = "Zonal velocity after $(prettytime(simulation_time))"
-fig = geo_heatlatlon_visualization(grid_cpu, u_f_r, title; k = Nz, common_kwargs...,
-                                   cbar_label = "zonal velocity (m s⁻¹)",
-                                   specify_plot_limits = true, plot_limits = (-3.5, 3.5))
-save("cubed_sphere_aquaplanet_u_f_geo_heatlatlon_plot_$iteration_id.png", fig)
-
-title = "Meridional velocity after $(prettytime(simulation_time))"
-fig = geo_heatlatlon_visualization(grid_cpu, v_f_r, title; k = Nz, common_kwargs...,
-                                   cbar_label = "meridional velocity (m s⁻¹)", specify_plot_limits = true,
-                                   plot_limits = (-2.25, 2.25))
-save("cubed_sphere_aquaplanet_v_f_geo_heatlatlon_plot_$iteration_id.png", fig)
-
-title = "Relative vorticity after $(prettytime(simulation_time))"
-fig = geo_heatlatlon_visualization(grid_cpu, ζ_f, title; k = Nz, common_kwargs...,
-                                   cbar_label = "relative vorticity (s⁻¹)", specify_plot_limits = true,
-                                   plot_limits = (-1.25e-5, 1.25e-5))
-save("cubed_sphere_aquaplanet_ζ_f_geo_heatlatlon_plot_$iteration_id.png", fig)
-
-title = "Vertical velocity after $(prettytime(simulation_time))"
-fig = geo_heatlatlon_visualization(grid_cpu, w_f, title; k = w_index, common_kwargs...,
-                                   cbar_label = "vertical velocity (m s⁻¹)", specify_plot_limits = true,
-                                   plot_limits = (-2e-4, 2e-4))
-save("cubed_sphere_aquaplanet_w_f_geo_heatlatlon_plot_$iteration_id.png", fig)
-
-title = "Surface elevation after $(prettytime(simulation_time))"
-fig = geo_heatlatlon_visualization(grid_cpu, η_f, title; ssh = true, common_kwargs_η...,
-                                   cbar_label = "surface elevation (m)", specify_plot_limits = true,
-                                   plot_limits = (-15, 15))
-save("cubed_sphere_aquaplanet_η_f_geo_heatlatlon_plot_$iteration_id.png", fig)
-
-title = "Buoyancy after $(prettytime(simulation_time))"
-fig = geo_heatlatlon_visualization(grid_cpu, b_f, title; k = b_index, common_kwargs..., cbar_label = "buoyancy (m s⁻²)",
-                                   specify_plot_limits = true, plot_limits = (-0.055, 0.055))
-save("cubed_sphere_aquaplanet_b_f_geo_heatlatlon_plot_$iteration_id.png", fig)
-
-title = "Deformation radius after $(prettytime(simulation_time))"
-fig = geo_heatlatlon_visualization(grid_cpu, Ld_f, title; levels = 1:1, common_kwargs_positive_scalar...,
-                                   cbar_label = "deformation radius (m)")
-save("cubed_sphere_aquaplanet_Ld_f_geo_heatlatlon_plot_$iteration_id.png", fig)
+if make_geo_heatlatlon_plots
+    title = "Zonal velocity after $(prettytime(simulation_time))"
+    fig = geo_heatlatlon_visualization(grid_cpu, u_f_r, title; k = Nz, common_kwargs...,
+                                       cbar_label = "zonal velocity (m s⁻¹)", specify_plot_limits = specify_plot_limits,
+                                       plot_limits = u_limits)
+    save("cubed_sphere_aquaplanet_u_f_geo_heatlatlon_plot_$iteration_id.png", fig)
+
+    title = "Meridional velocity after $(prettytime(simulation_time))"
+    fig = geo_heatlatlon_visualization(grid_cpu, v_f_r, title; k = Nz, common_kwargs...,
+                                       cbar_label = "meridional velocity (m s⁻¹)",
+                                       specify_plot_limits = specify_plot_limits, plot_limits = v_limits)
+    save("cubed_sphere_aquaplanet_v_f_geo_heatlatlon_plot_$iteration_id.png", fig)
+
+    title = "Relative vorticity after $(prettytime(simulation_time))"
+    fig = geo_heatlatlon_visualization(grid_cpu, ζ_f, title; k = Nz, common_kwargs...,
+                                       cbar_label = "relative vorticity (s⁻¹)",
+                                       specify_plot_limits = specify_plot_limits, plot_limits = ζ_limits)
+    save("cubed_sphere_aquaplanet_ζ_f_geo_heatlatlon_plot_$iteration_id.png", fig)
+
+    title = "Vertical velocity after $(prettytime(simulation_time))"
+    fig = geo_heatlatlon_visualization(grid_cpu, w_f, title; k = w_index, common_kwargs...,
+                                       cbar_label = "vertical velocity (m s⁻¹)",
+                                       specify_plot_limits = specify_plot_limits, plot_limits = w_limits)
+    save("cubed_sphere_aquaplanet_w_f_geo_heatlatlon_plot_$iteration_id.png", fig)
+
+    title = "Surface elevation after $(prettytime(simulation_time))"
+    fig = geo_heatlatlon_visualization(grid_cpu, η_f, title; ssh = true, common_kwargs_η...,
+                                       cbar_label = "surface elevation (m)", specify_plot_limits = specify_η_limits,
+                                       plot_limits = η_limits)
+    save("cubed_sphere_aquaplanet_η_f_geo_heatlatlon_plot_$iteration_id.png", fig)
+
+    title = "Buoyancy after $(prettytime(simulation_time))"
+    fig = geo_heatlatlon_visualization(grid_cpu, b_f, title; k = b_index, common_kwargs...,
+                                       cbar_label = "buoyancy (m s⁻²)", specify_plot_limits = specify_b_limits,
+                                       plot_limits = b_limits)
+    save("cubed_sphere_aquaplanet_b_f_geo_heatlatlon_plot_$iteration_id.png", fig)
+
+    title = "Deformation radius after $(prettytime(simulation_time))"
+    fig = geo_heatlatlon_visualization(grid_cpu, Ld_f, title; levels = 1:1, common_kwargs_positive_scalar...,
+                                       cbar_label = "deformation radius (m)")
+    save("cubed_sphere_aquaplanet_Ld_f_geo_heatlatlon_plot_$iteration_id.png", fig)
+end
 
 close(file_c)
 
@@ -840,43 +823,35 @@ extract_field_at_specific_longitude_through_panel_center(grid_cpu, Ld_f, panel_i
 
 title = "Zonal velocity after $(prettytime(simulation_time))"
 cbar_label = "zonal velocity (m s⁻¹)"
-create_heat_map_or_contour_plot(resolution, plot_type_2D,
-                                latitude_at_specific_longitude_through_panel_center[:, index],
+create_heat_map_or_contour_plot(resolution, plot_type_2D, latitude_at_specific_longitude_through_panel_center[:, index],
                                 depths/1000, u_f_at_specific_longitude_through_panel_center[:, :, index],
                                 axis_kwargs, title, contourlevels, cbar_kwargs, cbar_label,
                                 "cubed_sphere_aquaplanet_u_f_latitude-depth_section_$(panel_index)_$(iteration_id)";
-                                specify_plot_limits = true, plot_limits = (-3.5, 3.5),
-                                common_kwargs_vertical_section...)
+                                specify_plot_limits = false, plot_limits = u_limits, common_kwargs_vertical_section...)
 
 title = "Meridional velocity after $(prettytime(simulation_time))"
 cbar_label = "meridional velocity (m s⁻¹)"
-create_heat_map_or_contour_plot(resolution, plot_type_2D,
-                                latitude_at_specific_longitude_through_panel_center[:, index],
+create_heat_map_or_contour_plot(resolution, plot_type_2D, latitude_at_specific_longitude_through_panel_center[:, index],
                                 depths/1000, v_f_at_specific_longitude_through_panel_center[:, :, index],
                                 axis_kwargs, title, contourlevels, cbar_kwargs, cbar_label,
                                 "cubed_sphere_aquaplanet_v_f_latitude-depth_section_$(panel_index)_$(iteration_id)";
-                                specify_plot_limits = true, plot_limits = (-2.25, 2.25),
-                                common_kwargs_vertical_section...)
+                                specify_plot_limits = false, plot_limits = v_limits, common_kwargs_vertical_section...)
 
 title = "Relative vorticity after $(prettytime(simulation_time))"
 cbar_label = "relative vorticity (s⁻¹)"
-create_heat_map_or_contour_plot(resolution, plot_type_2D,
-                                latitude_at_specific_longitude_through_panel_center[:, index],
+create_heat_map_or_contour_plot(resolution, plot_type_2D, latitude_at_specific_longitude_through_panel_center[:, index],
                                 depths/1000, ζ_f_at_specific_longitude_through_panel_center[:, :, index],
                                 axis_kwargs, title, contourlevels, cbar_kwargs, cbar_label,
                                 "cubed_sphere_aquaplanet_ζ_f_latitude-depth_section_$(panel_index)_$(iteration_id)";
-                                specify_plot_limits = true, plot_limits = (-1.25e-5, 1.25e-5),
-                                common_kwargs_vertical_section...)
+                                specify_plot_limits = false, plot_limits = ζ_limits, common_kwargs_vertical_section...)
 
 title = "Vertical velocity after $(prettytime(simulation_time))"
 cbar_label = "vertical velocity (s⁻¹)"
-create_heat_map_or_contour_plot(resolution, plot_type_2D,
-                                latitude_at_specific_longitude_through_panel_center[:, index],
+create_heat_map_or_contour_plot(resolution, plot_type_2D, latitude_at_specific_longitude_through_panel_center[:, index],
                                 depths_f[2:Nz+1]/1000, w_f_at_specific_longitude_through_panel_center[:, 2:Nz+1, index],
                                 axis_kwargs, title, contourlevels, cbar_kwargs, cbar_label,
                                 "cubed_sphere_aquaplanet_w_f_latitude-depth_section_$(panel_index)_$(iteration_id)";
-                                specify_plot_limits = true, plot_limits = (-2e-4, 2e-4),
-                                common_kwargs_vertical_section...)
+                                specify_plot_limits = false, plot_limits = w_limits, common_kwargs_vertical_section...)
 
 title = "Surface elevation after $(prettytime(simulation_time))"
 create_single_line_or_scatter_plot(resolution, plot_type_1D,
@@ -884,7 +859,7 @@ create_single_line_or_scatter_plot(resolution, plot_type_1D,
                                    η_f_at_specific_longitude_through_panel_center[:, 1, index], axis_kwargs_η,
                                    title, plot_kwargs,
                                    "cubed_sphere_aquaplanet_η_f_latitude_$(panel_index)_$(iteration_id)";
-                                   tight_x_axis = true, specify_y_limits = true, y_limits = [-15, 15])
+                                   tight_x_axis = true, specify_y_limits = false, y_limits = η_limits)
 
 title = "Buoyancy after $(prettytime(simulation_time))"
 cbar_label = "buoyancy (m s⁻²)"
@@ -893,8 +868,7 @@ create_heat_map_or_contour_plot(resolution, plot_type_2D,
                                 depths/1000, b_f_at_specific_longitude_through_panel_center[:, :, index],
                                 axis_kwargs, title, contourlevels, cbar_kwargs, cbar_label,
                                 "cubed_sphere_aquaplanet_b_f_latitude-depth_section_$(panel_index)_$(iteration_id)";
-                                specify_plot_limits = true, plot_limits = (-0.055, 0.055),
-                                common_kwargs_vertical_section...)
+                                specify_plot_limits = false, plot_limits = b_limits, common_kwargs_vertical_section...)
 
 title = "Deformation radius after $(prettytime(simulation_time))"
 create_single_line_or_scatter_plot(resolution, plot_type_1D,
@@ -903,3 +877,18 @@ create_single_line_or_scatter_plot(resolution, plot_type_1D,
                                    title, plot_kwargs,
                                    "cubed_sphere_aquaplanet_Ld_f_latitude_$(panel_index)_$(iteration_id)";
                                    tight_x_axis = true)
+
+# Gather this iteration's figures and checkpoint file in one dedicated directory.
+output_directory = "cubed_sphere_aquaplanet_checkpointer_iteration$(iteration_id)"
+mkpath(output_directory) # does nothing if the directory already exists
+
+# List all files with the .png extension in the current working directory
+png_files = filter(name -> endswith(name, ".png"), readdir())
+
+# Move each .png file into the output directory, overwriting any file of the same name
+for png_file in png_files
+    mv(png_file, joinpath(output_directory, png_file); force=true)
+end
+
+checkpoint_file = output_directory * ".jld2"
+mv(checkpoint_file, joinpath(output_directory, checkpoint_file); force=true)
diff --git a/validation/open_boundaries/cylinder.jl b/validation/open_boundaries/cylinder.jl
new file mode 100644
index 0000000000..993a41c4c1
--- /dev/null
+++ b/validation/open_boundaries/cylinder.jl
@@ -0,0 +1,127 @@
+# This validation script shows open boundaries working in a simple case where the
+# flow remains largely unidirectional and so at one end we have no matching scheme
+# but just prescribe the inflow. At the other end we then make no assumptions about
+# the flow and use a very simple open boundary condition to permit information to 
+# exit the domain. If, for example, the flow at the prescribed boundary was reversed
+# then the model would likely fail.
+
+using Oceananigans, CairoMakie
+using Oceananigans.BoundaryConditions: FlatExtrapolationOpenBoundaryCondition
+
+
+@kwdef struct Cylinder{FT}
+    D :: FT = 1.0
+   x₀ :: FT = 0.0
+   y₀ :: FT = 0.0
+end
+
+@inline (cylinder::Cylinder)(x, y) = ifelse((x - cylinder.x₀)^2 + (y - cylinder.y₀)^2 < (cylinder.D/2)^2, 1, 0)
+
+architecture = GPU()
+
+# model parameters
+Re = 200
+U = 1
+D = 1.
+resolution = D / 40
+
+# add extra downstream distance to see if the solution near the cylinder changes
+extra_downstream = 0
+
+cylinder = Cylinder(; D)
+
+x = (-5, 5 + extra_downstream) .* D
+y = (-5, 5) .* D
+
+Ny = Int(10 / resolution)
+Nx = Ny + Int(extra_downstream / resolution)
+
+ν = U * D / Re
+
+closure = ScalarDiffusivity(;ν, κ = ν)
+
+grid = RectilinearGrid(architecture; topology = (Bounded, Periodic, Flat), size = (Nx, Ny), x, y)
+
+@inline u(y, t, U) = U * (1 + 0.01 * randn())
+
+u_boundaries = FieldBoundaryConditions(east = FlatExtrapolationOpenBoundaryCondition(),
+                                       west = OpenBoundaryCondition(u, parameters = U))
+
+v_boundaries = FieldBoundaryConditions(east = GradientBoundaryCondition(0),
+                                       west = GradientBoundaryCondition(0))
+
+Δt = .3 * resolution / U
+
+u_forcing = Relaxation(; rate = 1 / (2 * Δt), mask = cylinder)
+v_forcing = Relaxation(; rate = 1 / (2 * Δt), mask = cylinder) 
+
+model = NonhydrostaticModel(; grid, 
+                              closure, 
+                              forcing = (u = u_forcing, v = v_forcing),
+                              boundary_conditions = (u = u_boundaries, v = v_boundaries))
+
+@info "Constructed model"
+
+# initial noise to induce turbulence faster
+set!(model, u = U, v = (x, y) -> randn() * U * 0.01)
+
+@info "Set initial conditions"
+
+simulation = Simulation(model; Δt = Δt, stop_time = 300)
+
+wizard = TimeStepWizard(cfl = 0.3)
+
+simulation.callbacks[:wizard] = Callback(wizard, IterationInterval(100))
+
+progress(sim) = @info "$(time(sim)) with Δt = $(prettytime(sim.Δt)) in $(prettytime(sim.run_wall_time))"
+
+simulation.callbacks[:progress] = Callback(progress, IterationInterval(1000))
+
+simulation.output_writers[:velocity] = JLD2OutputWriter(model, model.velocities,
+                                                        overwrite_existing = true, 
+                                                        filename = "cylinder_$(extra_downstream)_Re_$Re.jld2", 
+                                                        schedule = TimeInterval(1),
+                                                        with_halos = true)
+
+run!(simulation)
+
+# load the results 
+
+u_ts = FieldTimeSeries("cylinder_$(extra_downstream)_Re_$Re.jld2", "u")
+v_ts = FieldTimeSeries("cylinder_$(extra_downstream)_Re_$Re.jld2", "v")
+
+u′, v′, w′ = Oceananigans.Fields.VelocityFields(u_ts.grid)
+
+ζ = Field((@at (Center, Center, Center) ∂x(v′)) - (@at (Center, Center, Center) ∂y(u′)))
+
+# Compute the vorticity at every saved time. There is probably a more memory-efficient way to do this.
+
+ζ_ts = zeros(size(grid, 1), size(grid, 2), length(u_ts.times)) # plain Array; u′, v′ and ζ are built on u_ts.grid (presumably on the CPU, confirm)
+
+for n in 1:length(u_ts.times)
+    set!(u′, u_ts[n])
+    set!(v′, v_ts[n])
+    compute!(ζ)
+    ζ_ts[:, :, n] = interior(ζ, :, :, 1) # keep the horizontal slice at third index 1 for snapshot n
+end
+
+@info "Loaded results"
+
+# plot the results
+
+fig = Figure(size = (600, 600))
+
+ax = Axis(fig[1, 1], aspect = DataAspect())
+
+xc, yc, zc = nodes(ζ)
+
+n = Observable(1)
+
+ζ_plt = @lift ζ_ts[:, :, $n]
+
+contour!(ax, xc, yc, ζ_plt, levels = [-2, 2], colorrange = (-2, 2), colormap = :roma)
+
+record(fig, "ζ_Re_$Re.mp4", 1:length(u_ts.times), framerate = 5) do i;
+    n[] = i
+    i % 10 == 0 && @info "$(n.val) of $(length(u_ts.times))"
+end
\ No newline at end of file
diff --git a/validation/open_boundaries/oscillating_flow.jl b/validation/open_boundaries/oscillating_flow.jl
new file mode 100644
index 0000000000..6bf48337ea
--- /dev/null
+++ b/validation/open_boundaries/oscillating_flow.jl
@@ -0,0 +1,154 @@
+# This validation script shows open boundaries working in a simple case where the
+# flow oscillates sinusoidally and so changes sign across two open boundaries. This is
+# similar to a more realistic case where we know some arbitrary external conditions.
+# This necessitates using a combination of allowing information to exit the domain, in
+# this case by setting the wall normal velocity gradient to zero, as well as forcing
+# to the external value, in this example by relaxing to it.
+
+# This case also has a stretched grid to validate the zero wall normal velocity 
+# gradient matching scheme on a stretched grid.
+
+using Oceananigans, CairoMakie
+using Oceananigans.BoundaryConditions: FlatExtrapolationOpenBoundaryCondition
+
+@kwdef struct Cylinder{FT}
+    D :: FT = 1.0
+   x₀ :: FT = 0.0
+   y₀ :: FT = 0.0
+end
+
+@inline (cylinder::Cylinder)(x, y) = ifelse((x - cylinder.x₀)^2 + (y - cylinder.y₀)^2 < (cylinder.D/2)^2, 1, 0)
+
+architecture = CPU()
+
+# model parameters
+Re = 200
+U = 1
+D = 1.
+resolution = D / 10
+
+# add extra downstream distance to see if the solution near the cylinder changes
+extra_downstream = 0
+
+cylinder = Cylinder(; D)
+
+x = (-5, 5 + extra_downstream) .* D
+
+Ny = Int(10 / resolution)
+
+Nx = Int((10 + extra_downstream) / resolution)
+
+# Cell width at index j: `resolution` within 2/resolution indices of Ny/2, growing linearly with index outside that band.
+function Δy(j)
+    if Ny/2 - 2/resolution < j < Ny/2 + 2/resolution
+        return resolution
+    elseif j <= Ny/2 - 2/resolution
+        return resolution * (1 + (Ny/2 - 2/resolution - j) / (Ny/2 - 2/resolution))
+    elseif j >= Ny/2 + 2/resolution
+        return resolution * (1 + (j - Ny/2 - 2/resolution) / (Ny/2 - 2/resolution))
+    end
+    throw(ArgumentError("$j not in range")) # reached only when no comparison holds, e.g. j is NaN
+end
+
+y(j) = sum(Δy.([1:j;])) - sum(Δy.([1:Ny;]))/2
+
+ν = U * D / Re
+
+closure = ScalarDiffusivity(;ν, κ = ν)
+
+grid = RectilinearGrid(architecture; topology = (Bounded, Bounded, Flat), size = (Nx, Ny), x = y, y = x)
+
+T = 20 / U
+
+@inline u(t, p)      = p.U * sin(t * 2π / p.T)
+@inline u(y, t, p)   = u(t, p)
+
+relaxation_timescale = 0.15
+
+u_boundaries = FieldBoundaryConditions(east = FlatExtrapolationOpenBoundaryCondition(u; relaxation_timescale, parameters = (; U, T)),
+                                       west = FlatExtrapolationOpenBoundaryCondition(u; relaxation_timescale, parameters = (; U, T)),
+                                       south = GradientBoundaryCondition(0),
+                                       north = GradientBoundaryCondition(0))
+
+v_boundaries = FieldBoundaryConditions(east = GradientBoundaryCondition(0),
+                                       west = GradientBoundaryCondition(0),
+                                       south = FlatExtrapolationOpenBoundaryCondition(0; relaxation_timescale),
+                                       north = FlatExtrapolationOpenBoundaryCondition(0; relaxation_timescale))
+
+Δt = .3 * resolution / U
+
+@show Δt
+
+u_forcing = Relaxation(; rate = 1 / (2 * Δt), mask = cylinder)
+v_forcing = Relaxation(; rate = 1 / (2 * Δt), mask = cylinder) 
+
+model = NonhydrostaticModel(; grid, 
+                              closure, 
+                              forcing = (u = u_forcing, v = v_forcing),
+                              boundary_conditions = (u = u_boundaries, v = v_boundaries))
+
+@info "Constructed model"
+
+# initial noise to induce turbulence faster
+set!(model, u = (x, y) -> randn() * U * 0.01, v = (x, y) -> randn() * U * 0.01)
+
+@info "Set initial conditions"
+
+simulation = Simulation(model; Δt = Δt, stop_time = 300)
+
+wizard = TimeStepWizard(cfl = 0.3, max_Δt = Δt)
+
+simulation.callbacks[:wizard] = Callback(wizard, IterationInterval(100))
+
+progress(sim) = @info "$(time(sim)) with Δt = $(prettytime(sim.Δt)) in $(prettytime(sim.run_wall_time))"
+
+simulation.callbacks[:progress] = Callback(progress, IterationInterval(1000))
+
+simulation.output_writers[:velocity] = JLD2OutputWriter(model, model.velocities,
+                                                        overwrite_existing = true, 
+                                                        filename = "oscillating_cylinder_$(extra_downstream)_Re_$Re.jld2", 
+                                                        schedule = TimeInterval(1),
+                                                        with_halos = true)
+
+run!(simulation)
+
+# load the results 
+
+u_ts = FieldTimeSeries("oscillating_cylinder_$(extra_downstream)_Re_$Re.jld2", "u")
+v_ts = FieldTimeSeries("oscillating_cylinder_$(extra_downstream)_Re_$Re.jld2", "v")
+
+u′, v′, w′ = Oceananigans.Fields.VelocityFields(u_ts.grid)
+
+ζ = Field((@at (Center, Center, Center) ∂x(v′)) - (@at (Center, Center, Center) ∂y(u′)))
+
+# Compute the vorticity at every saved time. There is probably a more memory-efficient way to do this.
+
+ζ_ts = zeros(size(grid, 1), size(grid, 2), length(u_ts.times)) # plain Array; u′, v′ and ζ are built on u_ts.grid (presumably on the CPU, confirm)
+
+for n in 1:length(u_ts.times)
+    set!(u′, u_ts[n])
+    set!(v′, v_ts[n])
+    compute!(ζ)
+    ζ_ts[:, :, n] = interior(ζ, :, :, 1) # keep the horizontal slice at third index 1 for snapshot n
+end
+
+@info "Loaded results"
+
+# plot the results
+
+fig = Figure(size = (600, 600))
+
+xc, yc, zc = nodes(ζ)
+
+ax = Axis(fig[1, 1], aspect = DataAspect(), limits = (minimum(xc), maximum(xc), minimum(yc), maximum(yc)))
+
+n = Observable(1)
+
+ζ_plt = @lift ζ_ts[:, :, $n]
+
+contour!(ax, xc, yc, ζ_plt, levels = [-1, 1], colorrange = (-1, 1), colormap = :roma)
+
+record(fig, "oscillating_ζ_Re_$(Re)_no_exterior.mp4", 1:length(u_ts.times), framerate = 5) do i;
+    n[] = i
+    i % 10 == 0 && @info "$(n.val) of $(length(u_ts.times))"
+end
\ No newline at end of file
diff --git a/validation/vertical_mixing_closures/column_windy_convection.jl b/validation/vertical_mixing_closures/column_windy_convection.jl
index 16a5b60446..2dc96f17ba 100644
--- a/validation/vertical_mixing_closures/column_windy_convection.jl
+++ b/validation/vertical_mixing_closures/column_windy_convection.jl
@@ -5,12 +5,11 @@ using Printf
 
 using Oceananigans.TurbulenceClosures:
     RiBasedVerticalDiffusivity,
-    CATKEVerticalDiffusivity,
-    ConvectiveAdjustmentVerticalDiffusivity,
-    ExplicitTimeDiscretization
+    TKEDissipationVerticalDiffusivity,
+    CATKEVerticalDiffusivity
 
 # Parameters
-Δz = 1          # Vertical resolution
+Δz = 2          # Vertical resolution
 Lz = 256        # Extent of vertical domain
 Nz = Int(Lz/Δz) # Vertical resolution
 f₀ = 1e-4       # Coriolis parameter (s⁻¹)
@@ -19,7 +18,7 @@ Jᵇ = +1e-7      # Surface buoyancy flux (m² s⁻³)
 τˣ = -2e-3      # Surface kinematic momentum flux (m s⁻¹)
 stop_time = 4days
 
-convective_adjustment = ConvectiveAdjustmentVerticalDiffusivity(convective_κz=0.1, convective_νz=0.01)
+tke_dissipation = TKEDissipationVerticalDiffusivity()
 catke = CATKEVerticalDiffusivity()
 ri_based = RiBasedVerticalDiffusivity()
 
@@ -29,19 +28,36 @@ grid = RectilinearGrid(size=Nz, z=(-Lz, 0), topology=(Flat, Flat, Bounded))
 coriolis = FPlane(f=f₀)
 b_bcs = FieldBoundaryConditions(top = FluxBoundaryCondition(Jᵇ))
 u_bcs = FieldBoundaryConditions(top = FluxBoundaryCondition(τˣ))
-closures_to_run = [catke] #, ri_based, convective_adjustment]
+closures_to_run = [tke_dissipation, catke]
+closure = tke_dissipation
+
+function progress(sim)
+    model = sim.model
+
+    msg = @sprintf("Iter: % 5d, time: % 16s, max(b): %6.2e",
+                   iteration(sim), prettytime(sim), maximum(model.tracers.b))
+
+    msg *= @sprintf(", max(κ): %6.2e, max(e): %6.2e, max(ϵ): %6.2e",
+                    maximum(model.diffusivity_fields.κc),
+                    maximum(model.tracers.e),
+                    maximum(model.tracers.ϵ))
+
+    @info msg
+
+    return nothing
+end
 
 for closure in closures_to_run
 
     model = HydrostaticFreeSurfaceModel(; grid, closure, coriolis,
-                                        tracers = (:b, :e),
+                                        tracers = (:b, :e, :ϵ),
                                         buoyancy = BuoyancyTracer(),
                                         boundary_conditions = (; b=b_bcs, u=u_bcs))
                                         
     bᵢ(z) = N² * z
     set!(model, b=bᵢ, e=1e-6)
 
-    simulation = Simulation(model; Δt=20minutes, stop_time)
+    simulation = Simulation(model; Δt=10.0, stop_time)
 
     closurename = string(nameof(typeof(closure)))
 
@@ -50,16 +66,18 @@ for closure in closures_to_run
 
     outputs = merge(model.velocities, model.tracers, diffusivities)
 
-    simulation.output_writers[:fields] = JLD2OutputWriter(model, outputs,
-                                                          schedule = TimeInterval(20minutes),
-                                                          filename = "windy_convection_" * closurename,
-                                                          overwrite_existing = true)
+    output_writer = JLD2OutputWriter(model, outputs,
+                                     schedule = TimeInterval(20minutes),
+                                     filename = "windy_convection_" * closurename,
+                                     overwrite_existing = true)
 
-    progress(sim) = @info string("Iter: ", iteration(sim), " t: ", prettytime(sim),
-                                 ", max(b): ", maximum(model.tracers.b))
-    simulation.callbacks[:progress] = Callback(progress, IterationInterval(100))
+    simulation.output_writers[:fields] = output_writer
+
+    
+    add_callback!(simulation, progress, IterationInterval(10))
 
-    @info "Running a simulation of $model..."
+    @info "Running a simulation of "
+    @info "$model"
 
     run!(simulation)
 end
@@ -92,13 +110,12 @@ end
 b1 = first(b_ts)
 e1 = first(e_ts)
 κ1 = first(κc_ts)
-@show maximum(e1)
 
 zc = znodes(b1)
 zf = znodes(κ1)
 Nt = length(b1.times)
 
-fig = Figure(size=(1800, 600))
+fig = Figure(size=(1400, 600))
 
 slider = Slider(fig[2, 1:4], range=1:Nt, startvalue=1)
 n = slider.value
@@ -114,9 +131,9 @@ axe = Axis(fig[1, 3], xlabel=TKE_label, ylabel="z (m)")
 axκ = Axis(fig[1, 4], xlabel=diffusivities_label, ylabel="z (m)")
 
 xlims!(axb, -grid.Lz * N², 0)
-xlims!(axu, -0.1, 0.1)
-xlims!(axe, -1e-4, 2e-4)
-xlims!(axκ, -1e-1, 5e-1)
+xlims!(axu, -0.2, 0.2)
+xlims!(axe, -1e-4, 1e-2)
+xlims!(axκ, -1e-1, 1e1)
 
 colors = [:black, :blue, :red, :orange]
 
diff --git a/validation/vertical_mixing_closures/compare_catke_k_epsilon.jl b/validation/vertical_mixing_closures/compare_catke_k_epsilon.jl
new file mode 100644
index 0000000000..d54033ea8f
--- /dev/null
+++ b/validation/vertical_mixing_closures/compare_catke_k_epsilon.jl
@@ -0,0 +1,114 @@
+using Oceananigans
+using Oceananigans.Units
+
+using Oceananigans.TurbulenceClosures.TKEBasedVerticalDiffusivities:
+    TKEDissipationVerticalDiffusivity,
+    CATKEVerticalDiffusivity,
+    VariableStabilityFunctions,
+    minimum_stratification_number,
+    maximum_shear_number,
+    ConstantStabilityFunctions,
+    stratification_numberᶜᶜᶠ,
+    shear_numberᶜᶜᶠ
+
+using GLMakie
+using Printf
+
+grid = RectilinearGrid(size=128, z=(-128, 0), topology=(Flat, Flat, Bounded))
+
+f = 0
+N² = 1e-5
+τˣ = -1e-4
+Jᵇ = 1e-7
+
+u_top_bc = FluxBoundaryCondition(τˣ)
+u_bcs = FieldBoundaryConditions(top=u_top_bc)
+
+b_top_bc = FluxBoundaryCondition(Jᵇ)
+b_bcs = FieldBoundaryConditions(top=b_top_bc)
+
+coriolis = FPlane(; f)
+
+const_stability_functions = ConstantStabilityFunctions()
+k_epsilon = TKEDissipationVerticalDiffusivity()
+k_epsilon_const_stability = TKEDissipationVerticalDiffusivity(stability_functions=const_stability_functions)
+catke = CATKEVerticalDiffusivity()
+
+bᵢ(z) = N² * z
+
+bn = []
+un = []
+vn = []
+en = []
+ϵn = []
+
+#stratification_numberᶜᶜᶠ(i, j, k, grid, closure, tracers, buoyancy)
+
+for closure in (k_epsilon, catke) #, k_epsilon_const_stability)
+    global model
+
+    model = HydrostaticFreeSurfaceModel(; grid, closure, coriolis,
+                                        tracers = (:b, :e, :ϵ),
+                                        buoyancy = BuoyancyTracer(),
+                                        boundary_conditions=(u=u_bcs, b=b_bcs))
+
+    set!(model, b=bᵢ)
+
+    simulation = Simulation(model, Δt=60.0, stop_time=1day)
+
+    local b, u, v, w, e, ϵ
+
+    u, v, w = model.velocities
+    e = model.tracers.e
+    ϵ = model.tracers.ϵ
+    b = model.tracers.b
+
+    progress(sim) = @info @sprintf("Iter: % 4d, time: % 24s, max(e): %6.2e, max(ϵ): %6.2e",
+                                   iteration(sim), prettytime(sim), maximum(e), maximum(ϵ))
+    
+    add_callback!(simulation, progress, IterationInterval(100))
+
+    run!(simulation)
+
+    push!(bn, deepcopy(interior(b, 1, 1, :)))
+    push!(un, deepcopy(interior(u, 1, 1, :)))
+    push!(vn, deepcopy(interior(v, 1, 1, :)))
+    push!(en, deepcopy(interior(e, 1, 1, :)))
+    push!(ϵn, deepcopy(interior(ϵ, 1, 1, :)))
+end
+
+z = znodes(model.tracers.e)
+
+colors = [:black, :blue, :tomato]
+
+fig = Figure(size=(800, 400))
+# One panel per profile, left to right: buoyancy, horizontal velocities (u solid, v dashed), TKE, ϵ
+axb = Axis(fig[1, 1], title="Buoyancy")
+axu = Axis(fig[1, 2], title="Velocities")
+axe = Axis(fig[1, 3], title="TKE")
+axϵ = Axis(fig[1, 4], title="Epsilon")
+
+lines!(axb, bn[1], z, color=colors[1])
+lines!(axu, un[1], z, color=colors[1], label="u")
+lines!(axu, vn[1], z, color=colors[1], linestyle=:dash, label="v")
+lines!(axe, en[1], z, color=colors[1], label="k-ϵ")
+lines!(axϵ, ϵn[1], z, color=colors[1])
+
+lines!(axb, bn[2], z, color=colors[2])
+lines!(axu, un[2], z, color=colors[2], label="u")
+lines!(axu, vn[2], z, color=colors[2], linestyle=:dash, label="v")
+lines!(axe, en[2], z, color=colors[2], label="CATKE")
+lines!(axϵ, ϵn[2], z, color=colors[2])
+
+#=
+lines!(axb, bn[3], z, color=colors[3])
+lines!(axu, un[3], z, color=colors[3], label="u")
+lines!(axu, vn[3], z, color=colors[3], linestyle=:dash, label="v")
+lines!(axe, en[3], z, color=colors[3], label="k-ϵ (constant stability functions)")
+lines!(axϵ, ϵn[3], z, color=colors[3])
+=#
+
+Legend(fig[0, 1:4], axe, nbanks=3, framevisible=false, tellheight=true)
+
+fig
+
diff --git a/validation/vertical_mixing_closures/k_epsilon_boundary_layer.jl b/validation/vertical_mixing_closures/k_epsilon_boundary_layer.jl
new file mode 100644
index 0000000000..cb3506a0c2
--- /dev/null
+++ b/validation/vertical_mixing_closures/k_epsilon_boundary_layer.jl
@@ -0,0 +1,123 @@
+using Oceananigans
+using Oceananigans.Units
+
+using Oceananigans.TurbulenceClosures.TKEBasedVerticalDiffusivities:
+    TKEDissipationVerticalDiffusivity,
+    CATKEVerticalDiffusivity,
+    VariableStabilityFunctions,
+    minimum_stratification_number,
+    maximum_shear_number,
+    ConstantStabilityFunctions,
+    stratification_numberᶜᶜᶠ,
+    shear_numberᶜᶜᶠ,
+    momentum_stability_functionᶜᶜᶠ,
+    tracer_stability_functionᶜᶜᶠ
+
+using GLMakie
+using Printf
+# Build and run a single-column boundary-layer simulation with the TKE-dissipation closure.
+grid = RectilinearGrid(size=256, z=(-128, 0), topology=(Flat, Flat, Bounded))
+# f feeds the FPlane, N² the initial buoyancy slope, τˣ and Jᵇ the top fluxes of u and b
+f = 1e-4
+N² = 1e-5
+τˣ = -1e-4
+Jᵇ = 5e-8
+# Flux boundary conditions applied at the top of the column
+u_top_bc = FluxBoundaryCondition(τˣ)
+u_bcs = FieldBoundaryConditions(top=u_top_bc)
+
+b_top_bc = FluxBoundaryCondition(Jᵇ)
+b_bcs = FieldBoundaryConditions(top=b_top_bc)
+
+coriolis = FPlane(; f)
+# Swap the two closure lines below to run CATKE instead
+closure = TKEDissipationVerticalDiffusivity()
+#closure = CATKEVerticalDiffusivity()
+
+model = HydrostaticFreeSurfaceModel(; grid, closure, coriolis,
+                                    tracers = (:b, :e, :ϵ),
+                                    buoyancy = BuoyancyTracer(),
+                                    boundary_conditions=(u=u_bcs, b=b_bcs))
+# Initial condition: buoyancy varies linearly in z with slope N²
+bᵢ(z) = N² * z
+set!(model, b=bᵢ)
+
+simulation = Simulation(model, Δt=60.0, stop_time=1day)
+# Handles to model fields used by the progress message and by the plots further down
+u, v, w = model.velocities
+e = model.tracers.e
+ϵ = model.tracers.ϵ
+b = model.tracers.b
+
+tracers = model.tracers
+buoyancy = model.buoyancy
+velocities = model.velocities 
+
+κc = model.diffusivity_fields.κc
+κe = model.diffusivity_fields.κe
+κϵ = model.diffusivity_fields.κϵ
+# Log the iteration, time, max(e), and the extrema of ϵ every 100 iterations
+progress(sim) = @info @sprintf("Iter: % 4d, time: % 24s, max(e): %6.2e, extrema(ϵ): (%6.2e, %6.2e)",
+                               iteration(sim), prettytime(sim), maximum(e), minimum(ϵ), maximum(ϵ))
+
+add_callback!(simulation, progress, IterationInterval(100))
+
+run!(simulation)
+# Take the vertical profiles at horizontal index (1, 1) and plot them in one row of panels.
+κcn = interior(κc, 1, 1, :)
+bn = interior(b, 1, 1, :)
+un = interior(u, 1, 1, :)
+vn = interior(v, 1, 1, :)
+en = interior(e, 1, 1, :)
+ϵn = interior(ϵ, 1, 1, :)
+# Vertical coordinates: zc comes from the tracer e, zf from the diffusivity κc
+zc = znodes(model.tracers.e)
+zf = znodes(κc)
+
+fig = Figure(size=(1600, 400))
+# Seven panels; axα and axs are filled in by the diagnostics later in the script
+axb = Axis(fig[1, 1], title="Buoyancy")
+axu = Axis(fig[1, 2], title="Velocities")
+axe = Axis(fig[1, 3], title="TKE")
+axϵ = Axis(fig[1, 4], title="Epsilon")
+axκ = Axis(fig[1, 5], title="Diffusivity")
+axα = Axis(fig[1, 6], title="αᴺ, αᴹ")
+axs = Axis(fig[1, 7], title="Stability functions")
+# b, u, v, e, and ϵ are drawn against zc; κc is drawn against zf
+scatterlines!(axb, bn, zc)
+lines!(axu, un, zc, label="u")
+lines!(axu, vn, zc, label="v")
+axislegend(axu, position=:rb)
+
+lines!(axe, en, zc, label="k-ϵ")
+lines!(axϵ, ϵn, zc)
+lines!(axκ, κcn, zf)
+# Evaluate the stability functions as fields at (Center, Center, Face) and extract their profiles.
+𝕊ᵘ_op = KernelFunctionOperation{Center, Center, Face}(momentum_stability_functionᶜᶜᶠ, grid, closure, velocities, tracers, buoyancy)
+𝕊ᶜ_op = KernelFunctionOperation{Center, Center, Face}(tracer_stability_functionᶜᶜᶠ, grid, closure, velocities, tracers, buoyancy)
+𝕊ᵘ = Field(𝕊ᵘ_op)
+𝕊ᶜ = Field(𝕊ᶜ_op)
+compute!(𝕊ᵘ)
+compute!(𝕊ᶜ)
+𝕊ᵘn = interior(𝕊ᵘ, 1, 1, :)
+𝕊ᶜn = interior(𝕊ᶜ, 1, 1, :)
+# Same treatment for the stratification number αᴺ and the shear number αᴹ; x-axis clipped to (-30, 50)
+αᴺ_op = KernelFunctionOperation{Center, Center, Face}(stratification_numberᶜᶜᶠ, grid, closure, tracers, buoyancy)
+αᴹ_op = KernelFunctionOperation{Center, Center, Face}(shear_numberᶜᶜᶠ, grid, closure, velocities, tracers, buoyancy)
+αᴺ = Field(αᴺ_op)
+αᴹ = Field(αᴹ_op)
+compute!(αᴺ)
+compute!(αᴹ)
+αᴹn = interior(αᴹ, 1, 1, :)
+αᴺn = interior(αᴺ, 1, 1, :)
+scatterlines!(axα, αᴺn, zf, label="αᴺ")
+scatterlines!(axα, αᴹn, zf, label="αᴹ")
+axislegend(axα)
+xlims!(axα, -30, 50)
+# Momentum (𝕊ᵘ) and tracer (𝕊ᶜ) stability function profiles against zf
+lines!(axs, 𝕊ᵘn, zf, label="𝕊ᵘ")
+lines!(axs, 𝕊ᶜn, zf, label="𝕊ᶜ")
+axislegend(axs, position=:rb)
+
+fig
+
diff --git a/validation/vertical_mixing_closures/look_at_stability_functions.jl b/validation/vertical_mixing_closures/look_at_stability_functions.jl
new file mode 100644
index 0000000000..88f8afe025
--- /dev/null
+++ b/validation/vertical_mixing_closures/look_at_stability_functions.jl
@@ -0,0 +1,67 @@
+using Oceananigans
+
+using Oceananigans.TurbulenceClosures.TKEBasedVerticalDiffusivities:
+    TKEDissipationVerticalDiffusivity,
+    momentum_stability_function,
+    maximum_stratification_number,
+    minimum_stratification_number,
+    maximum_shear_number,
+    minimum_shear_number,
+    tracer_stability_function
+
+using GLMakie
+using Printf
+# Plot the closure's tracer and momentum stability functions and their ratio Pr over (αᴺ, αᴹ).
+closure = TKEDissipationVerticalDiffusivity()
+
+αᴺmin = minimum_stratification_number(closure)
+αᴺmax = 30.0
+αᴺ = αᴺmin:0.01:αᴺmax
+# αᴹmax is evaluated per αᴺ value; the αᴹ range extends to the largest of those values
+αᴹmin = minimum_shear_number(closure)
+αᴹmax = maximum_shear_number.(Ref(closure), αᴺ)
+αᴹ = αᴹmin:0.01:maximum(αᴹmax)
+
+fig = Figure(size=(1600, 600))
+ax1 = Axis(fig[1, 1], title="Stability functions", xlabel="αᴺ", ylabel="𝕊c")
+ax2 = Axis(fig[2, 1], title="Prandtl number", xlabel="αᴺ", ylabel="Pr")
+ax3 = Axis(fig[1:2, 2], title="Tracer stability functions", xlabel="αᴺ", ylabel="αᴹ")
+ax4 = Axis(fig[1:2, 3], title="Momentum stability functions", xlabel="αᴺ", ylabel="αᴹ")
+ax5 = Axis(fig[1:2, 4], title="Prandtl number", xlabel="αᴺ", ylabel="αᴹ")
+# Stability functions along the upper (αᴹmax) and lower (αᴹmin) shear-number limits
+𝕊c_max_αᴹ = tracer_stability_function.(Ref(closure), αᴺ, αᴹmax)
+𝕊u_max_αᴹ = momentum_stability_function.(Ref(closure), αᴺ, αᴹmax)
+
+𝕊c_min_αᴹ = tracer_stability_function.(Ref(closure), αᴺ, αᴹmin)
+𝕊u_min_αᴹ = momentum_stability_function.(Ref(closure), αᴺ, αᴹmin)
+# Broadcast a column of αᴺ against a row of αᴹ to obtain NN × NM tables
+NN = length(αᴺ)
+NM = length(αᴹ)
+𝕊c = tracer_stability_function.(Ref(closure), reshape(αᴺ, NN, 1), reshape(αᴹ, 1, NM))
+𝕊u = momentum_stability_function.(Ref(closure), reshape(αᴺ, NN, 1), reshape(αᴹ, 1, NM))
+Pr = 𝕊u ./ 𝕊c
+
+Pr_max = maximum(Pr, dims=2)[:]
+Pr_min = minimum(Pr, dims=2)[:]
+# Tracer curves in blue, momentum curves in red; dashed lines mark the αᴹmin limit
+lines!(ax1, αᴺ, 𝕊c_max_αᴹ, label="max(αᴹ)", color=:blue)
+lines!(ax1, αᴺ, 𝕊c_min_αᴹ, label="min(αᴹ)", linestyle=:dash, color=:blue)
+
+lines!(ax1, αᴺ, 𝕊u_max_αᴹ, color=:red)
+lines!(ax1, αᴺ, 𝕊u_min_αᴹ, linestyle=:dash, color=:red)
+
+band!(ax1, αᴺ, 𝕊c_min_αᴹ, 𝕊c_max_αᴹ, label="𝕊c", color=(:blue, 0.5))
+band!(ax1, αᴺ, 𝕊u_min_αᴹ, 𝕊u_max_αᴹ, label="𝕊u", color=(:red, 0.5))
+axislegend(ax1)
+
+band!(ax2, αᴺ, Pr_min, Pr_max, label="Pr", color=(:blue, 0.5))
+# Both panels share `levels`, so one colorbar serves them; the bands come from `levels` alone
+cf = contourf!(ax3, αᴺ, αᴹ, 𝕊c, levels=0.01:0.03:0.2)
+cf = contourf!(ax4, αᴺ, αᴹ, 𝕊u, levels=0.01:0.03:0.2)
+Colorbar(fig[3, 2:3], cf, vertical=false, tellwidth=false, label="Stability functions", flipaxis=false)
+# The Prandtl-number map belongs on ax5, the fifth and last axis created above
+cf = contourf!(ax5, αᴺ, αᴹ, Pr, levels=0.3:0.1:3.0, colormap=:solar)
+Colorbar(fig[3, 4], cf, vertical=false, tellwidth=false, label="Prandtl number", flipaxis=false)
+
+display(fig)
+
diff --git a/validation/vertical_mixing_closures/tupled_vertical_diffusion.jl b/validation/vertical_mixing_closures/tupled_vertical_diffusion.jl
index 7fb859fbf7..8c47a36953 100644
--- a/validation/vertical_mixing_closures/tupled_vertical_diffusion.jl
+++ b/validation/vertical_mixing_closures/tupled_vertical_diffusion.jl
@@ -1,34 +1,19 @@
-using GLMakie
 using Oceananigans
 using Oceananigans.Units
+using Oceananigans.TurbulenceClosures: CATKEVerticalDiffusivity
 
-arch = CPU()
-grid = RectilinearGrid(arch, size=128, z=(-5, 5), topology=(Flat, Flat, Bounded))
+grid = RectilinearGrid(size=128, z=(-128, 0), topology=(Flat, Flat, Bounded))
 
-single_closure = VerticalScalarDiffusivity(VerticallyImplicitTimeDiscretization(), κ=1)
+closure = (VerticalScalarDiffusivity(VerticallyImplicitTimeDiscretization(), κ=1e-4), 
+           CATKEVerticalDiffusivity())
 
-tuple_closure = (VerticalScalarDiffusivity(VerticallyImplicitTimeDiscretization(), κ=1/2), 
-                 VerticalScalarDiffusivity(VerticallyImplicitTimeDiscretization(), κ=1/2))
+model = HydrostaticFreeSurfaceModel(; grid, closure,
+                                    tracers = (:b, :e),
+                                    buoyancy = BuoyancyTracer())
 
-closure = tuple_closure
-
-model = HydrostaticFreeSurfaceModel(; grid, closure, tracers=:c, buoyancy=nothing, coriolis=nothing)
-set!(model, c = (x, y, z) -> exp(-z^2))
-simulation = Simulation(model, Δt=1e-4, stop_iteration=100)
-
-c_snapshots = []
-c = model.tracers.c
-getc(sim) = push!(c_snapshots, Array(interior(c, 1, 1, :)))
-simulation.callbacks[:c] = Callback(getc, IterationInterval(10))
+bᵢ(z) = 1e-5 * z
+set!(model, b = bᵢ)
+simulation = Simulation(model, Δt=1minute, stop_iteration=10)
 
 run!(simulation)
 
-fig = Figure()
-ax = Axis(fig[1, 1])
-Nt = length(c_snapshots)
-slider = Slider(fig[2, 1], range=1:Nt, startvalue=1)
-n = slider.value
-z = znodes(c)
-c = @lift c_snapshots[$n]
-lines!(ax, c, z)
-display(fig)